diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_indices/8e89af82-5757-47a3-b0c3-49f88825aa3d/index.idx b/.lancedb/content_aware_chunking_BAAI.lance/_indices/8e89af82-5757-47a3-b0c3-49f88825aa3d/index.idx
new file mode 100644
index 0000000000000000000000000000000000000000..fac0ced5b71ca6bdbfb3937048c570b1046e6550
--- /dev/null
+++ b/.lancedb/content_aware_chunking_BAAI.lance/_indices/8e89af82-5757-47a3-b0c3-49f88825aa3d/index.idx
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7693146fec515d8d3c3cce840c8841d05395ed8549ff581ff8db48069d1b5eea
+size 2398061
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_latest.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_latest.manifest
new file mode 100644
index 0000000000000000000000000000000000000000..c9f969187903ffd838d342e2e36a02b9a2b9461c
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_latest.manifest differ
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/0-8cb90146-e8f0-40f4-adb1-a91b96920237.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/0-8cb90146-e8f0-40f4-adb1-a91b96920237.txn
new file mode 100644
index 0000000000000000000000000000000000000000..89b7b4cad959fa0468c3ce614fb3b046d37b0b3f
--- /dev/null
+++ b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/0-8cb90146-e8f0-40f4-adb1-a91b96920237.txn
@@ -0,0 +1 @@
+$8cb90146-e8f0-40f4-adb1-a91b96920237²V3vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:102408text ÿÿÿÿÿÿÿÿÿ*string08
\ No newline at end of file
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/1-04c71273-da7d-41b5-842e-07a7cd56b714.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/1-04c71273-da7d-41b5-842e-07a7cd56b714.txn
new file mode 100644
index 0000000000000000000000000000000000000000..ca29db80233bc5bac7dad3972cb1defa08513d96
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/1-04c71273-da7d-41b5-842e-07a7cd56b714.txn differ
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/10-9eeeab3d-dadf-4090-9130-365b08165ceb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/10-9eeeab3d-dadf-4090-9130-365b08165ceb.txn
new file mode 100644
index 0000000000000000000000000000000000000000..f8ced341f26494a8121126233b434f7c5e28e302
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/10-9eeeab3d-dadf-4090-9130-365b08165ceb.txn differ
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/100-a6f91f0c-2db3-4e57-a4c1-37d03b1e9391.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/100-a6f91f0c-2db3-4e57-a4c1-37d03b1e9391.txn
new file mode 100644
index 0000000000000000000000000000000000000000..3424c67129880e940c29b2cc654510565ff477d7
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/100-a6f91f0c-2db3-4e57-a4c1-37d03b1e9391.txn differ
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/101-9aecfac5-9c4a-4d7e-bcbb-bdf277804d86.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/101-9aecfac5-9c4a-4d7e-bcbb-bdf277804d86.txn
new file mode 100644
index 0000000000000000000000000000000000000000..4e52990309f64307bf8fdbba70aaaeb8eae00251
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/101-9aecfac5-9c4a-4d7e-bcbb-bdf277804d86.txn differ
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/102-eac3dc26-70ce-43ce-a622-114fa75b137c.txn
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/102-eac3dc26-70ce-43ce-a622-114fa75b137c.txn new file mode 100644 index 0000000000000000000000000000000000000000..d579ec009d04e5a282cd0e8b1fef3da76206ef93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/102-eac3dc26-70ce-43ce-a622-114fa75b137c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/103-997996f5-eeda-4f83-9a03-916a38c2dbb4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/103-997996f5-eeda-4f83-9a03-916a38c2dbb4.txn new file mode 100644 index 0000000000000000000000000000000000000000..c0c891ccb8da9ecdc499c5e99ce3f35a605e9c45 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/103-997996f5-eeda-4f83-9a03-916a38c2dbb4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/104-7624f14d-fe63-422a-a484-89afff04e8b7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/104-7624f14d-fe63-422a-a484-89afff04e8b7.txn new file mode 100644 index 0000000000000000000000000000000000000000..87fe5a4d7f65ea9d66bbdd09f9a864a744bcfb1f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/104-7624f14d-fe63-422a-a484-89afff04e8b7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/105-6260c683-fac3-4a45-839b-ca13166341f5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/105-6260c683-fac3-4a45-839b-ca13166341f5.txn new file mode 100644 index 0000000000000000000000000000000000000000..ab92d8384e4c8b4e5b86196265792e32358dca3e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/105-6260c683-fac3-4a45-839b-ca13166341f5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/106-84a625a1-d70b-4eb8-a3a4-69117a257320.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/106-84a625a1-d70b-4eb8-a3a4-69117a257320.txn new file mode 100644 index 0000000000000000000000000000000000000000..bbee20dab29d981d1ed09c53b25be32217f4a1ed Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/106-84a625a1-d70b-4eb8-a3a4-69117a257320.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/107-24cbc301-6541-4768-bec9-1f4916e39379.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/107-24cbc301-6541-4768-bec9-1f4916e39379.txn new file mode 100644 index 0000000000000000000000000000000000000000..42bbe356aa2bac634e1a3d9294dfa50b8a7dcb06 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/107-24cbc301-6541-4768-bec9-1f4916e39379.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/108-0d55fb5a-eda0-43cd-9187-3141346f853e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/108-0d55fb5a-eda0-43cd-9187-3141346f853e.txn new file mode 100644 index 0000000000000000000000000000000000000000..31701a3de7841bfa01f86bae007ee9a0109dade7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/108-0d55fb5a-eda0-43cd-9187-3141346f853e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/109-fa66564b-1d6d-4bf2-b1ea-d8dbaf8c6850.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/109-fa66564b-1d6d-4bf2-b1ea-d8dbaf8c6850.txn new file mode 100644 index 0000000000000000000000000000000000000000..58bab7a8119ef6d9c09ad86a0d84cfb4f0f96f4f Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/109-fa66564b-1d6d-4bf2-b1ea-d8dbaf8c6850.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/11-d0338d87-e35f-4b9a-8911-283f15b48942.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/11-d0338d87-e35f-4b9a-8911-283f15b48942.txn new file mode 100644 index 0000000000000000000000000000000000000000..ae9bf129dd969ba0a5556f5d5a45fd6ea47824c5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/11-d0338d87-e35f-4b9a-8911-283f15b48942.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/110-ee0b8f50-63ca-4737-90d5-8b9fe409aac3.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/110-ee0b8f50-63ca-4737-90d5-8b9fe409aac3.txn new file mode 100644 index 0000000000000000000000000000000000000000..926fec151d385f9e155ff3d1f75c4370e3f3efc3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/110-ee0b8f50-63ca-4737-90d5-8b9fe409aac3.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/111-de25a868-2538-434d-b36c-65506ba45c1c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/111-de25a868-2538-434d-b36c-65506ba45c1c.txn new file mode 100644 index 0000000000000000000000000000000000000000..9658ac8b1acdb8dd7ec0bb6061b95faa96717413 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/111-de25a868-2538-434d-b36c-65506ba45c1c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/112-bd77d89c-4059-48cf-8d18-98d25527b083.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/112-bd77d89c-4059-48cf-8d18-98d25527b083.txn new file mode 100644 index 0000000000000000000000000000000000000000..83f5fd4a45f419cf56e1b99a63073230dd2bbcae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/112-bd77d89c-4059-48cf-8d18-98d25527b083.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/113-86bd7c47-2974-495f-a55b-dc16e95802b5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/113-86bd7c47-2974-495f-a55b-dc16e95802b5.txn new file mode 100644 index 0000000000000000000000000000000000000000..03385d4a9d50d77f633f5d2c0e9b2cf343f4b0c3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/113-86bd7c47-2974-495f-a55b-dc16e95802b5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/114-6afa8599-071f-4d5e-984d-c0d1dc83a49d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/114-6afa8599-071f-4d5e-984d-c0d1dc83a49d.txn new file mode 100644 index 0000000000000000000000000000000000000000..597ce87caea25436aea7409150adcd654a846902 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/114-6afa8599-071f-4d5e-984d-c0d1dc83a49d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/115-40c50ec2-0265-4518-8c85-233770687f89.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/115-40c50ec2-0265-4518-8c85-233770687f89.txn new file mode 100644 index 0000000000000000000000000000000000000000..6ba98bf014d33d59c65a52653e6f099bca649190 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/115-40c50ec2-0265-4518-8c85-233770687f89.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/116-8bf3e2e3-74dc-4e93-b3a6-79ce559f4f5d.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/116-8bf3e2e3-74dc-4e93-b3a6-79ce559f4f5d.txn new file mode 100644 index 0000000000000000000000000000000000000000..026f886889639ac62f9fee2c7148dd57997396af Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/116-8bf3e2e3-74dc-4e93-b3a6-79ce559f4f5d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/117-d7000b5b-d9bd-4fe5-9ccc-eea48c5f0207.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/117-d7000b5b-d9bd-4fe5-9ccc-eea48c5f0207.txn new file mode 100644 index 0000000000000000000000000000000000000000..87ea43831ea0007108029583d96ec791f8d5930c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/117-d7000b5b-d9bd-4fe5-9ccc-eea48c5f0207.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/118-420388d5-06b5-4413-8471-2d3c2368b0d8.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/118-420388d5-06b5-4413-8471-2d3c2368b0d8.txn new file mode 100644 index 0000000000000000000000000000000000000000..5b0d7c767acd43475663151d93b08beb185dd38d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/118-420388d5-06b5-4413-8471-2d3c2368b0d8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/119-561d80cb-3ba3-42fb-a9da-6c358dba4ba5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/119-561d80cb-3ba3-42fb-a9da-6c358dba4ba5.txn new file mode 100644 index 0000000000000000000000000000000000000000..6729bea0099c46b52db14e0ea64d53e363336540 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/119-561d80cb-3ba3-42fb-a9da-6c358dba4ba5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/12-39506c18-891c-488c-b935-7ede34e5a32e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/12-39506c18-891c-488c-b935-7ede34e5a32e.txn new file mode 100644 index 0000000000000000000000000000000000000000..746dbf13e97b2ded24a71daef31220e2e8455aa4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/12-39506c18-891c-488c-b935-7ede34e5a32e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/120-4e85eea0-5544-4f33-bf39-d53894322199.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/120-4e85eea0-5544-4f33-bf39-d53894322199.txn new file mode 100644 index 0000000000000000000000000000000000000000..4eb868c13060866b47c45f72b0e4bf1550510689 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/120-4e85eea0-5544-4f33-bf39-d53894322199.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/121-197c0b91-0354-4e6c-9a1f-de8972e72dd5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/121-197c0b91-0354-4e6c-9a1f-de8972e72dd5.txn new file mode 100644 index 0000000000000000000000000000000000000000..bacfa534ea226c86c8a566a4cb67be06427024b7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/121-197c0b91-0354-4e6c-9a1f-de8972e72dd5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/122-7947e9a5-fbed-416e-ae44-e16003842a48.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/122-7947e9a5-fbed-416e-ae44-e16003842a48.txn new file mode 100644 index 0000000000000000000000000000000000000000..098d228bef3c5b246aa58658ef10bf75cec20833 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/122-7947e9a5-fbed-416e-ae44-e16003842a48.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/123-4054220d-3cbd-4bf0-863d-3827f8a87c66.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/123-4054220d-3cbd-4bf0-863d-3827f8a87c66.txn new file mode 100644 index 0000000000000000000000000000000000000000..80cf1dde8e6c035476d1b75ed33844332c277448 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/123-4054220d-3cbd-4bf0-863d-3827f8a87c66.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/124-925afcd2-2554-4df6-a9a6-0d965e2ddc40.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/124-925afcd2-2554-4df6-a9a6-0d965e2ddc40.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff2f481fd43ec269b47be02b704fe32931114cb5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/124-925afcd2-2554-4df6-a9a6-0d965e2ddc40.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/125-d4eae214-2e9e-4283-9539-989bf50d8beb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/125-d4eae214-2e9e-4283-9539-989bf50d8beb.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a611c6ba9b535413cf9368eb63a161b0292c750 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/125-d4eae214-2e9e-4283-9539-989bf50d8beb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/126-ceb5d4c7-802c-462b-8ec0-33aef4ee0c65.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/126-ceb5d4c7-802c-462b-8ec0-33aef4ee0c65.txn new file mode 100644 index 0000000000000000000000000000000000000000..d8539bb58f7c27fba46df3e11ece99b7d9cb5e2f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/126-ceb5d4c7-802c-462b-8ec0-33aef4ee0c65.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/127-98a56afd-7b9f-42e6-9967-019011bf4ff8.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/127-98a56afd-7b9f-42e6-9967-019011bf4ff8.txn new file mode 100644 index 0000000000000000000000000000000000000000..8c435be056ba0aa4c48670238eea3d69415ef9e5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/127-98a56afd-7b9f-42e6-9967-019011bf4ff8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/128-dcbe1f3a-131f-49bf-af59-840b2d0c54ae.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/128-dcbe1f3a-131f-49bf-af59-840b2d0c54ae.txn new file mode 100644 index 0000000000000000000000000000000000000000..0a5d021ee19f04697ae8b2de7d1cc1e85e64edf9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/128-dcbe1f3a-131f-49bf-af59-840b2d0c54ae.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/129-732eb0fb-88cd-4d83-925d-88949dcea63e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/129-732eb0fb-88cd-4d83-925d-88949dcea63e.txn new file mode 100644 index 0000000000000000000000000000000000000000..492b387c8f517717e21171c265638f133251a1c9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/129-732eb0fb-88cd-4d83-925d-88949dcea63e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/13-6c362494-4957-438c-8f6d-bbab2b56f6da.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/13-6c362494-4957-438c-8f6d-bbab2b56f6da.txn new file mode 100644 index 0000000000000000000000000000000000000000..e9c2023bd9c0a9827991b11689b5aff521afe89d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/13-6c362494-4957-438c-8f6d-bbab2b56f6da.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/130-6020ae15-a2c6-4db9-9d31-64f4882c9421.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/130-6020ae15-a2c6-4db9-9d31-64f4882c9421.txn new file mode 100644 index 0000000000000000000000000000000000000000..6b87032f9ff56b5921ef7657c2e5b35ac8bdc227 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/130-6020ae15-a2c6-4db9-9d31-64f4882c9421.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/131-29aab7f9-0aa8-445e-b84e-1d004f718fe7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/131-29aab7f9-0aa8-445e-b84e-1d004f718fe7.txn new file mode 100644 index 0000000000000000000000000000000000000000..b1e322d5848a3c672f867edca96d838363eaab15 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/131-29aab7f9-0aa8-445e-b84e-1d004f718fe7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/132-c018b5a8-95d4-4e48-a7f6-d603827327c0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/132-c018b5a8-95d4-4e48-a7f6-d603827327c0.txn new file mode 100644 index 0000000000000000000000000000000000000000..129fe24b2f19b4aab9f7a78bccd2a65b184cfd25 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/132-c018b5a8-95d4-4e48-a7f6-d603827327c0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/133-c9523d45-5d89-4a23-85c5-0691f159f877.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/133-c9523d45-5d89-4a23-85c5-0691f159f877.txn new file mode 100644 index 0000000000000000000000000000000000000000..24b21f6e97b5702baa4356791dbf07a15a6d9b5b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/133-c9523d45-5d89-4a23-85c5-0691f159f877.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/134-957d07a0-e01d-4b84-b029-6fcc862d60c7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/134-957d07a0-e01d-4b84-b029-6fcc862d60c7.txn new file mode 100644 index 0000000000000000000000000000000000000000..917e433f9d1ce458fe2d51fb76898896e2d68a8e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/134-957d07a0-e01d-4b84-b029-6fcc862d60c7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/135-bd3ff13f-7c83-410e-8c0b-ff3d3aaf13dc.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/135-bd3ff13f-7c83-410e-8c0b-ff3d3aaf13dc.txn new file mode 100644 index 0000000000000000000000000000000000000000..8df3f29741f30535324d3973173cd41c97525be6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/135-bd3ff13f-7c83-410e-8c0b-ff3d3aaf13dc.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/136-fd0d4487-2c43-4eb5-b0af-20c0980c750c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/136-fd0d4487-2c43-4eb5-b0af-20c0980c750c.txn new file mode 100644 index 0000000000000000000000000000000000000000..73299a48fcaddf8effeed994a22998a4fe5c22b7 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/136-fd0d4487-2c43-4eb5-b0af-20c0980c750c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/137-7e0c7279-7c7d-49a5-bc8e-00174389ede7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/137-7e0c7279-7c7d-49a5-bc8e-00174389ede7.txn new file mode 100644 index 0000000000000000000000000000000000000000..410662088473e14f1d8087cc904aeed31e9d73aa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/137-7e0c7279-7c7d-49a5-bc8e-00174389ede7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/138-a21c7f61-5f4f-43cf-9dab-e3cea44f09e1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/138-a21c7f61-5f4f-43cf-9dab-e3cea44f09e1.txn new file mode 100644 index 0000000000000000000000000000000000000000..c39b18fa0964eed2c26667422b5f82a7c3d99100 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/138-a21c7f61-5f4f-43cf-9dab-e3cea44f09e1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/139-7b1e8d34-a907-4767-a4ac-fd646362ea15.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/139-7b1e8d34-a907-4767-a4ac-fd646362ea15.txn new file mode 100644 index 0000000000000000000000000000000000000000..b24588f33224a2d594c199066010cd3ec410492b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/139-7b1e8d34-a907-4767-a4ac-fd646362ea15.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/14-06ae5211-224e-4a24-a57e-f1b47194eeb1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/14-06ae5211-224e-4a24-a57e-f1b47194eeb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..d89d862ea8fb1b0816ae78f41c261f3fbed0ae1c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/14-06ae5211-224e-4a24-a57e-f1b47194eeb1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/140-b302eba0-0cc5-45d4-b4e4-c5e23e77670b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/140-b302eba0-0cc5-45d4-b4e4-c5e23e77670b.txn new file mode 100644 index 0000000000000000000000000000000000000000..b52bb59fdc78da05dbc684597302b63a1c54c698 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/140-b302eba0-0cc5-45d4-b4e4-c5e23e77670b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/141-3b14082b-d034-4d6d-b4f8-4bc2dc2707ac.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/141-3b14082b-d034-4d6d-b4f8-4bc2dc2707ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..d14d530c023137dc12de37e0021b9cf520196084 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/141-3b14082b-d034-4d6d-b4f8-4bc2dc2707ac.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/142-d5aec3eb-5dba-42d1-9712-510da00088b6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/142-d5aec3eb-5dba-42d1-9712-510da00088b6.txn new file mode 100644 index 0000000000000000000000000000000000000000..ef9fdfe5c37c4161c7a84187ed4907b4e9ed0a4a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/142-d5aec3eb-5dba-42d1-9712-510da00088b6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/143-d7b28bf3-e3f1-4740-a1fa-164fcfbad765.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/143-d7b28bf3-e3f1-4740-a1fa-164fcfbad765.txn new file mode 100644 index 0000000000000000000000000000000000000000..426caebab4251630e31f71b7b6f780969a5d96aa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/143-d7b28bf3-e3f1-4740-a1fa-164fcfbad765.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/144-a3f1a39d-0181-431c-997c-fb80c524e634.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/144-a3f1a39d-0181-431c-997c-fb80c524e634.txn new file mode 100644 index 0000000000000000000000000000000000000000..e575b4f3f4adbc6db04462f6c52fd60ea4c89a73 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/144-a3f1a39d-0181-431c-997c-fb80c524e634.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/145-597d39d0-4f76-43b3-9a19-e5da37610908.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/145-597d39d0-4f76-43b3-9a19-e5da37610908.txn new file mode 100644 index 0000000000000000000000000000000000000000..9d62ee039562d73b449f8769dc98012b2f559259 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/145-597d39d0-4f76-43b3-9a19-e5da37610908.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/146-9d9c5047-8b43-47b1-b2a3-fa4e5a602469.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/146-9d9c5047-8b43-47b1-b2a3-fa4e5a602469.txn new file mode 100644 index 0000000000000000000000000000000000000000..c90d23786b5c3a92e6d6e051f30a8844a08b5c11 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/146-9d9c5047-8b43-47b1-b2a3-fa4e5a602469.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/147-4afa5b67-1e2a-4254-8470-96dabee55c55.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/147-4afa5b67-1e2a-4254-8470-96dabee55c55.txn new file mode 100644 index 0000000000000000000000000000000000000000..310bdb31df906fb9cf003b1800a298768d09efdf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/147-4afa5b67-1e2a-4254-8470-96dabee55c55.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/148-a6bf9fba-8fd8-4189-b114-f83513713448.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/148-a6bf9fba-8fd8-4189-b114-f83513713448.txn new file mode 100644 index 0000000000000000000000000000000000000000..d2963212096d566a4874e6f5440ec4b2162387af Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/148-a6bf9fba-8fd8-4189-b114-f83513713448.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/149-46df0621-11fc-4895-8c0f-0de07eeebfd7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/149-46df0621-11fc-4895-8c0f-0de07eeebfd7.txn new file mode 100644 index 0000000000000000000000000000000000000000..0a3e31114bea53a26ee2493620ee9febc2610980 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/149-46df0621-11fc-4895-8c0f-0de07eeebfd7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/15-fd8ef924-417d-4254-9f5d-a0d402731a5c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/15-fd8ef924-417d-4254-9f5d-a0d402731a5c.txn new file mode 100644 index 0000000000000000000000000000000000000000..9d9099f543a3a38ff9961011ab58ec57683dd5ad Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/15-fd8ef924-417d-4254-9f5d-a0d402731a5c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/150-cf5c70ef-835a-4936-aca9-a7aa36ed4d58.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/150-cf5c70ef-835a-4936-aca9-a7aa36ed4d58.txn new file mode 100644 index 0000000000000000000000000000000000000000..a8a2d3834f9edcd6d9232063dcae713b46bec2d5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/150-cf5c70ef-835a-4936-aca9-a7aa36ed4d58.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/151-8b4efc82-505b-411e-9050-c84e4561c364.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/151-8b4efc82-505b-411e-9050-c84e4561c364.txn new file mode 100644 index 0000000000000000000000000000000000000000..94ccdaf1a53f4aa28f40ae57cced7b84cf0513dd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/151-8b4efc82-505b-411e-9050-c84e4561c364.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/152-f63cc2af-5f6e-428a-bb3e-df6936d834ae.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/152-f63cc2af-5f6e-428a-bb3e-df6936d834ae.txn new file mode 100644 index 0000000000000000000000000000000000000000..61b8589a81a8b2f8d8ed06ba4285e28934f71023 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/152-f63cc2af-5f6e-428a-bb3e-df6936d834ae.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/153-27120a2a-56f4-43d7-8be6-2c169e456e76.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/153-27120a2a-56f4-43d7-8be6-2c169e456e76.txn new file mode 100644 index 0000000000000000000000000000000000000000..2cb46c8df5057f967ae90b4f781c69624cc919fb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/153-27120a2a-56f4-43d7-8be6-2c169e456e76.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/154-2c20d2da-3748-49a1-927a-ef929b3ecf05.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/154-2c20d2da-3748-49a1-927a-ef929b3ecf05.txn new file mode 100644 index 0000000000000000000000000000000000000000..8cde8fdbd02c017429c3774ef931fff53290fb9a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/154-2c20d2da-3748-49a1-927a-ef929b3ecf05.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/155-213071b1-e715-44dc-8ab9-7df4ed0a03d0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/155-213071b1-e715-44dc-8ab9-7df4ed0a03d0.txn new file mode 100644 index 0000000000000000000000000000000000000000..5a87747619865bc848ed19b358fa9d92df418355 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/155-213071b1-e715-44dc-8ab9-7df4ed0a03d0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/156-b40e0121-7120-4a0f-aa76-c0b6f0f5dc8e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/156-b40e0121-7120-4a0f-aa76-c0b6f0f5dc8e.txn new file mode 100644 index 0000000000000000000000000000000000000000..70ff2676d8f8f929b169a100316dd8a81653db93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/156-b40e0121-7120-4a0f-aa76-c0b6f0f5dc8e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/157-88c92097-c720-4ed4-85a8-7ac68b3ba09d.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/157-88c92097-c720-4ed4-85a8-7ac68b3ba09d.txn new file mode 100644 index 0000000000000000000000000000000000000000..4b63057a18d9e6a48f2c5298e0985c6ea32fdd8b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/157-88c92097-c720-4ed4-85a8-7ac68b3ba09d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/158-1b353638-6dd8-40a8-aa51-762ca4c02914.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/158-1b353638-6dd8-40a8-aa51-762ca4c02914.txn new file mode 100644 index 0000000000000000000000000000000000000000..c5fa07b92ebf96e29126bae23cfa04f0c0450ab0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/158-1b353638-6dd8-40a8-aa51-762ca4c02914.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/159-b87c0e96-946c-4f21-b07e-84c21bcb54e0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/159-b87c0e96-946c-4f21-b07e-84c21bcb54e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..a1fc99f3bf3a6e8096f050079c07462cae28a3dd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/159-b87c0e96-946c-4f21-b07e-84c21bcb54e0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/16-c3200cec-08bc-4968-a621-23cfd03cb926.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/16-c3200cec-08bc-4968-a621-23cfd03cb926.txn new file mode 100644 index 0000000000000000000000000000000000000000..3cd80f873f9bdf231446094253cc42cdc5e1e6fa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/16-c3200cec-08bc-4968-a621-23cfd03cb926.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/160-71e43823-97d8-4127-8299-5353c205dcf5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/160-71e43823-97d8-4127-8299-5353c205dcf5.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b08fa0b868a7e8b01d274a5b585f7b1e045e9ec Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/160-71e43823-97d8-4127-8299-5353c205dcf5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/161-3c6dc5d7-f599-46da-8e6d-bbecf61b3d7e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/161-3c6dc5d7-f599-46da-8e6d-bbecf61b3d7e.txn new file mode 100644 index 0000000000000000000000000000000000000000..fb727ed1c8f182b9a01618ac44972422994e2bd8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/161-3c6dc5d7-f599-46da-8e6d-bbecf61b3d7e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/162-00a09543-e119-4145-83ef-9bf1b4e3166c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/162-00a09543-e119-4145-83ef-9bf1b4e3166c.txn new file mode 100644 index 0000000000000000000000000000000000000000..8e183ba8c8ac40c15001aafeb6de5ec7b28ed19d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/162-00a09543-e119-4145-83ef-9bf1b4e3166c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/163-8ca8b473-51ad-4620-a668-2a9b6382d839.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/163-8ca8b473-51ad-4620-a668-2a9b6382d839.txn new file mode 100644 index 0000000000000000000000000000000000000000..5c6f5654d4085c945499b2a905ee08aefedaf1ec Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/163-8ca8b473-51ad-4620-a668-2a9b6382d839.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/164-deb3ff59-c8f4-4cc3-8c71-26e51e340f87.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/164-deb3ff59-c8f4-4cc3-8c71-26e51e340f87.txn new file mode 100644 index 0000000000000000000000000000000000000000..b939876ce2082b7d8a4f12c1b8d8306f945a7a57 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/164-deb3ff59-c8f4-4cc3-8c71-26e51e340f87.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/165-4db17e57-1224-4302-b3d4-d7dc6078d91a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/165-4db17e57-1224-4302-b3d4-d7dc6078d91a.txn new file mode 100644 index 0000000000000000000000000000000000000000..0c41f52c2e929a56d9d5d30903fef1e46c5a7a93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/165-4db17e57-1224-4302-b3d4-d7dc6078d91a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/166-42716b25-6c7c-4cf3-bd4c-37b7ce065c82.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/166-42716b25-6c7c-4cf3-bd4c-37b7ce065c82.txn new file mode 100644 index 0000000000000000000000000000000000000000..71d86e32705a7a81fe2215413912229e39c45b7d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/166-42716b25-6c7c-4cf3-bd4c-37b7ce065c82.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/167-222025ed-60db-4401-a834-1eab4f97738c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/167-222025ed-60db-4401-a834-1eab4f97738c.txn new file mode 100644 index 0000000000000000000000000000000000000000..c681558d32d77adcf8efc420fd57b52e0eb6785f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/167-222025ed-60db-4401-a834-1eab4f97738c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/168-99ab673f-1482-4146-a29b-f029335945f1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/168-99ab673f-1482-4146-a29b-f029335945f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..bd7bbaca9fcdd70df5e4eadb579fdba35f4b45b1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/168-99ab673f-1482-4146-a29b-f029335945f1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/169-581154b5-425e-469f-8491-e0e576a2e28d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/169-581154b5-425e-469f-8491-e0e576a2e28d.txn new file mode 100644 index 0000000000000000000000000000000000000000..dc942532b3472f72a4843e190d152f45e5c168a3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/169-581154b5-425e-469f-8491-e0e576a2e28d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/17-69680abf-6dc7-45b6-a56d-ea804fc9c38c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/17-69680abf-6dc7-45b6-a56d-ea804fc9c38c.txn new file mode 100644 index 0000000000000000000000000000000000000000..131cb4a7f6413a45bd985a3040ded501a2b1d1da Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/17-69680abf-6dc7-45b6-a56d-ea804fc9c38c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/170-fb3454d2-f39d-4c64-a0eb-a6f5bb1ec1a3.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/170-fb3454d2-f39d-4c64-a0eb-a6f5bb1ec1a3.txn new file mode 100644 index 0000000000000000000000000000000000000000..584fc03f23eb99e9679ac8313b726674ff3d8b3a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/170-fb3454d2-f39d-4c64-a0eb-a6f5bb1ec1a3.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/171-d34f77fc-4db3-48af-9416-2947fc56a619.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/171-d34f77fc-4db3-48af-9416-2947fc56a619.txn new file mode 100644 index 0000000000000000000000000000000000000000..77a6ddd50a3c1dce5d3fea35b8cb02a7c4335425 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/171-d34f77fc-4db3-48af-9416-2947fc56a619.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/172-b2660b2a-97b9-480c-bb3a-891498f6503f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/172-b2660b2a-97b9-480c-bb3a-891498f6503f.txn new file mode 100644 index 0000000000000000000000000000000000000000..36ca499ad93bd942251c4ec6df5e29686f367c35 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/172-b2660b2a-97b9-480c-bb3a-891498f6503f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/173-5e4f18d3-e0c8-41ef-90db-ddc27567a7ad.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/173-5e4f18d3-e0c8-41ef-90db-ddc27567a7ad.txn new file mode 100644 index 0000000000000000000000000000000000000000..d96fef0513fa3c3e2ee973cd3af07b3cc2111587 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/173-5e4f18d3-e0c8-41ef-90db-ddc27567a7ad.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/174-1c4bc64c-329c-4790-9c5a-1ba583a500de.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/174-1c4bc64c-329c-4790-9c5a-1ba583a500de.txn new file mode 100644 index 0000000000000000000000000000000000000000..c6c1ccd8cd387c845b2be49d7bf5d8f2c8a5fa94 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/174-1c4bc64c-329c-4790-9c5a-1ba583a500de.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/175-09ce2368-776d-43d5-ad9a-8eccd20ed439.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/175-09ce2368-776d-43d5-ad9a-8eccd20ed439.txn new file mode 100644 index 0000000000000000000000000000000000000000..5d74ff6d4988a539073fb8772e25050fdc00adff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/175-09ce2368-776d-43d5-ad9a-8eccd20ed439.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/176-cf5da463-3a8f-47ca-bbc2-d08e94b8dcd5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/176-cf5da463-3a8f-47ca-bbc2-d08e94b8dcd5.txn new file mode 100644 index 0000000000000000000000000000000000000000..7d06229638f318981c0f2e0528fe23314eebc6f3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/176-cf5da463-3a8f-47ca-bbc2-d08e94b8dcd5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/177-a0867ab3-01a0-4be0-8b61-7b545b102b38.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/177-a0867ab3-01a0-4be0-8b61-7b545b102b38.txn new file mode 100644 index 0000000000000000000000000000000000000000..9bf69d4bafae424dd72c35c4c5ee40f58439af36 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/177-a0867ab3-01a0-4be0-8b61-7b545b102b38.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/178-6a174c5d-1912-4a88-8258-48ca229dc683.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/178-6a174c5d-1912-4a88-8258-48ca229dc683.txn new file mode 100644 index 0000000000000000000000000000000000000000..9203b26d7a7aa8520878c30b2b0faf2cc0fc213d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/178-6a174c5d-1912-4a88-8258-48ca229dc683.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/179-e350c74d-01a7-45f9-8b9b-49d9e65d03eb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/179-e350c74d-01a7-45f9-8b9b-49d9e65d03eb.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0906966238c84a86fae18188d9ac7911c91ccda Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/179-e350c74d-01a7-45f9-8b9b-49d9e65d03eb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/18-b7f08ca9-6784-4425-8c9e-967c840fbb65.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/18-b7f08ca9-6784-4425-8c9e-967c840fbb65.txn new file mode 100644 index 0000000000000000000000000000000000000000..47fa75b24c32cb7cfdb105e02a242c89d4f70ba8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/18-b7f08ca9-6784-4425-8c9e-967c840fbb65.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/180-55bf932a-4ab4-4074-8c3f-cd994b7d19da.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/180-55bf932a-4ab4-4074-8c3f-cd994b7d19da.txn new file mode 100644 index 0000000000000000000000000000000000000000..22699e0205d3d704a010bd11dcdcd9cd040457a2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/180-55bf932a-4ab4-4074-8c3f-cd994b7d19da.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/181-1e5338dd-92e8-4065-917a-2fe1d4c584f2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/181-1e5338dd-92e8-4065-917a-2fe1d4c584f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..98a545ab7a4dabb2943be026376ee660c570d6ea Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/181-1e5338dd-92e8-4065-917a-2fe1d4c584f2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/182-2709751f-c0f5-4467-8744-b7bbac8676e5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/182-2709751f-c0f5-4467-8744-b7bbac8676e5.txn new file mode 100644 index 0000000000000000000000000000000000000000..403c5ae1c560fb37907a333ae0fe3341f581b1ca Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/182-2709751f-c0f5-4467-8744-b7bbac8676e5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/183-1cfd8dfc-c8f4-493e-9815-9f70eee4526f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/183-1cfd8dfc-c8f4-493e-9815-9f70eee4526f.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e5fce924178b4d8cde7e51f080a5c20967114b3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/183-1cfd8dfc-c8f4-493e-9815-9f70eee4526f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/184-1b60df5c-fb09-41ce-93a3-b05c1aace1d5.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/184-1b60df5c-fb09-41ce-93a3-b05c1aace1d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..8807bce0e5e4ba94075d676cbb11d068e85846d6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/184-1b60df5c-fb09-41ce-93a3-b05c1aace1d5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/185-91053738-bcb0-424e-9032-619d8633f977.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/185-91053738-bcb0-424e-9032-619d8633f977.txn new file mode 100644 index 0000000000000000000000000000000000000000..7dce2f7bfc70a0bf4faa7cc9b793e63c0dbc791e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/185-91053738-bcb0-424e-9032-619d8633f977.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/186-48081f09-ed97-42d7-865a-c71211d7cd3b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/186-48081f09-ed97-42d7-865a-c71211d7cd3b.txn new file mode 100644 index 0000000000000000000000000000000000000000..101d9f25e37a9ce5628e49060e18f4516e7076f8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/186-48081f09-ed97-42d7-865a-c71211d7cd3b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/187-619d3b5b-961f-464d-b832-971b1d6a5111.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/187-619d3b5b-961f-464d-b832-971b1d6a5111.txn new file mode 100644 index 0000000000000000000000000000000000000000..b66b647a00261913e4178b2a5261ecf3ad445309 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/187-619d3b5b-961f-464d-b832-971b1d6a5111.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/188-09b06e05-0c55-4a25-b0ad-87356ae9db8e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/188-09b06e05-0c55-4a25-b0ad-87356ae9db8e.txn new file mode 100644 index 0000000000000000000000000000000000000000..6c06baafc850325af4e1e8401e673a727e196c92 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/188-09b06e05-0c55-4a25-b0ad-87356ae9db8e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/189-63206dc7-84f2-40e9-a9d5-5b2717d2446a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/189-63206dc7-84f2-40e9-a9d5-5b2717d2446a.txn new file mode 100644 index 0000000000000000000000000000000000000000..c6c155439213a5c3eeb45d64f67cd4f276705161 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/189-63206dc7-84f2-40e9-a9d5-5b2717d2446a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/19-37955869-f47a-43b8-9ff3-0766d11e1fa5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/19-37955869-f47a-43b8-9ff3-0766d11e1fa5.txn new file mode 100644 index 0000000000000000000000000000000000000000..19dfea027bb4fdbda9f2b850aca7eb59dc362ac6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/19-37955869-f47a-43b8-9ff3-0766d11e1fa5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/190-1671e7bb-0925-4483-9974-8b3f15177ec7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/190-1671e7bb-0925-4483-9974-8b3f15177ec7.txn new file mode 100644 index 0000000000000000000000000000000000000000..cfb9d14739338e3452e4aeaea3256410b58e2e89 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/190-1671e7bb-0925-4483-9974-8b3f15177ec7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/191-992c910b-cb88-43aa-b0be-3705dc044c7d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/191-992c910b-cb88-43aa-b0be-3705dc044c7d.txn new file mode 100644 index 0000000000000000000000000000000000000000..0b47fd736ab256c38622b367588face745432ff0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/191-992c910b-cb88-43aa-b0be-3705dc044c7d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/192-a2a58316-dc14-4840-8049-2cfeb689deb1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/192-a2a58316-dc14-4840-8049-2cfeb689deb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..f335ee0c1dc200da4558011a64fef5d29bb32f81 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/192-a2a58316-dc14-4840-8049-2cfeb689deb1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/193-760f1c0e-483d-4637-96ea-a75919c83a1d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/193-760f1c0e-483d-4637-96ea-a75919c83a1d.txn new file mode 100644 index 0000000000000000000000000000000000000000..e3cb888de4133f317490461c5ec54a9f0ba6ff50 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/193-760f1c0e-483d-4637-96ea-a75919c83a1d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/194-d296570a-3a0d-478e-86f1-421c3fc00fe1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/194-d296570a-3a0d-478e-86f1-421c3fc00fe1.txn new file mode 100644 index 0000000000000000000000000000000000000000..01a92b46d93e8eebdcdd2ba06679c005e9723e4f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/194-d296570a-3a0d-478e-86f1-421c3fc00fe1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/195-efbc6454-1b0d-410a-a234-709a29ade1f5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/195-efbc6454-1b0d-410a-a234-709a29ade1f5.txn new file mode 100644 index 0000000000000000000000000000000000000000..4fb3f496cb23315c9a2c68f842f7b27153dc325d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/195-efbc6454-1b0d-410a-a234-709a29ade1f5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/196-021dbb1a-fc1b-4473-a986-602476f4feae.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/196-021dbb1a-fc1b-4473-a986-602476f4feae.txn new file mode 100644 index 0000000000000000000000000000000000000000..6a674e5b3dc9a94e034299d1eb27b9699240d2d9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/196-021dbb1a-fc1b-4473-a986-602476f4feae.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/197-adc25078-16b5-4042-8bbc-b8345725a3d9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/197-adc25078-16b5-4042-8bbc-b8345725a3d9.txn new file mode 100644 index 0000000000000000000000000000000000000000..3df7f5453daf0873d728c7407607b33b88ad3b28 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/197-adc25078-16b5-4042-8bbc-b8345725a3d9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/198-1ca523e1-e77f-4120-94c5-089a2045aa58.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/198-1ca523e1-e77f-4120-94c5-089a2045aa58.txn new file mode 100644 index 0000000000000000000000000000000000000000..9f8c61b636df72716a3312aea3234b122828212e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/198-1ca523e1-e77f-4120-94c5-089a2045aa58.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/199-f0b4471a-e483-4c0f-ad8c-252515f2355c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/199-f0b4471a-e483-4c0f-ad8c-252515f2355c.txn new file mode 100644 index 0000000000000000000000000000000000000000..9548ba36eb2b601c2212f105c69b6dab37fa570b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/199-f0b4471a-e483-4c0f-ad8c-252515f2355c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/2-f6766165-11ab-498c-8ad5-71a82a0f76e2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/2-f6766165-11ab-498c-8ad5-71a82a0f76e2.txn new file mode 100644 index 0000000000000000000000000000000000000000..440675cd1984856440ff5f3c00149cadccf7e1fe Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/2-f6766165-11ab-498c-8ad5-71a82a0f76e2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/20-82ccfc36-5437-42d7-abd6-9702b1c58768.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/20-82ccfc36-5437-42d7-abd6-9702b1c58768.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e16e69e7ce71717c91891c8007e30e97a1747b3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/20-82ccfc36-5437-42d7-abd6-9702b1c58768.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/200-6d7bb2a6-13d8-4803-8883-8ba25322d329.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/200-6d7bb2a6-13d8-4803-8883-8ba25322d329.txn new file mode 100644 index 0000000000000000000000000000000000000000..dd4f60b6cb1f51efd9ed4b624b0d77574ce4feb4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/200-6d7bb2a6-13d8-4803-8883-8ba25322d329.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/201-4f0f1e5b-bb64-4963-8b05-d5a5f64540d1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/201-4f0f1e5b-bb64-4963-8b05-d5a5f64540d1.txn new file mode 100644 index 0000000000000000000000000000000000000000..1b5ae988ec7ba9b62e5a431c96abbb3a6c2930ef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/201-4f0f1e5b-bb64-4963-8b05-d5a5f64540d1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/202-81a7e921-acd5-4334-b4a7-cb787e6efa8c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/202-81a7e921-acd5-4334-b4a7-cb787e6efa8c.txn new file mode 100644 index 0000000000000000000000000000000000000000..df850ac08a297dd37345a5558f144a9d83a18c48 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/202-81a7e921-acd5-4334-b4a7-cb787e6efa8c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/203-543fb3be-05e4-476c-98df-2dd698c51e44.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/203-543fb3be-05e4-476c-98df-2dd698c51e44.txn new file mode 100644 index 0000000000000000000000000000000000000000..f52a2b33690c91892551a0b708c2c39c852d3dbb Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/203-543fb3be-05e4-476c-98df-2dd698c51e44.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/204-b84e1368-55a1-41ce-b4ce-f502944bd407.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/204-b84e1368-55a1-41ce-b4ce-f502944bd407.txn new file mode 100644 index 0000000000000000000000000000000000000000..a0bfed02e3187f7243499412330069a54e160709 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/204-b84e1368-55a1-41ce-b4ce-f502944bd407.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/205-5ba63bda-2519-4ed4-bbc1-df7fb92d55e0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/205-5ba63bda-2519-4ed4-bbc1-df7fb92d55e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..533ebc5e4906fca7037b9b8f32511cd4c7e0e672 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/205-5ba63bda-2519-4ed4-bbc1-df7fb92d55e0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/206-4d8936b5-f9c6-473c-8c47-f6c98eaf20ba.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/206-4d8936b5-f9c6-473c-8c47-f6c98eaf20ba.txn new file mode 100644 index 0000000000000000000000000000000000000000..0e93762b96141f6b92cb4c734f93caa0c2cae9fd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/206-4d8936b5-f9c6-473c-8c47-f6c98eaf20ba.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/207-5fbe1451-d4e9-4451-9b88-4dce1a8167c5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/207-5fbe1451-d4e9-4451-9b88-4dce1a8167c5.txn new file mode 100644 index 0000000000000000000000000000000000000000..cefde043cb2a824b60c344af0294aa7a75fb01ed Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/207-5fbe1451-d4e9-4451-9b88-4dce1a8167c5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/208-92f5cc0c-a543-4f67-82e7-af37f3058ae6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/208-92f5cc0c-a543-4f67-82e7-af37f3058ae6.txn new file mode 100644 index 0000000000000000000000000000000000000000..5fdcc9c9cecc5c5b8a62b553643477ccd501f502 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/208-92f5cc0c-a543-4f67-82e7-af37f3058ae6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/209-dc1e1990-faa1-4de4-8156-fb8e1ecb9f24.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/209-dc1e1990-faa1-4de4-8156-fb8e1ecb9f24.txn new file mode 100644 index 0000000000000000000000000000000000000000..d7c994c06bb7665c7c935c1fd1067ff70e3e358d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/209-dc1e1990-faa1-4de4-8156-fb8e1ecb9f24.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/21-a07afd73-ee2f-4010-9e81-5c180248095b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/21-a07afd73-ee2f-4010-9e81-5c180248095b.txn new file mode 100644 index 0000000000000000000000000000000000000000..6233b71cccb511111ded9f9e00a6cf34d0faf3ff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/21-a07afd73-ee2f-4010-9e81-5c180248095b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/210-97ba29a0-aabd-4e78-b626-d62861246efd.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/210-97ba29a0-aabd-4e78-b626-d62861246efd.txn new file mode 100644 index 0000000000000000000000000000000000000000..de6d80605c9bbb7d4611d016ccdcaa30fdd4f916 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/210-97ba29a0-aabd-4e78-b626-d62861246efd.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/211-f3f53032-29f7-4d65-9202-5da136787de2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/211-f3f53032-29f7-4d65-9202-5da136787de2.txn new file mode 100644 index 0000000000000000000000000000000000000000..df6425cf4c19dce48f11d23da084b789fb05c138 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/211-f3f53032-29f7-4d65-9202-5da136787de2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/212-0abcd489-d0b4-415f-aebd-08e5ae6f1073.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/212-0abcd489-d0b4-415f-aebd-08e5ae6f1073.txn new file mode 100644 index 0000000000000000000000000000000000000000..4ff00d4a1afd64cb7f094855068001b6533e2b84 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/212-0abcd489-d0b4-415f-aebd-08e5ae6f1073.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/213-606bb963-3aaf-449e-a8a5-9c0c8307ffb9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/213-606bb963-3aaf-449e-a8a5-9c0c8307ffb9.txn new file mode 100644 index 0000000000000000000000000000000000000000..9d96f94d5132dc84bf787e4f3df7182404cb09e2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/213-606bb963-3aaf-449e-a8a5-9c0c8307ffb9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/214-a1b7ef4d-3b21-4e5b-be7b-c588a6768154.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/214-a1b7ef4d-3b21-4e5b-be7b-c588a6768154.txn new file mode 100644 index 0000000000000000000000000000000000000000..a42936fca8ad79f7766c9c62345978ba7b18c2d1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/214-a1b7ef4d-3b21-4e5b-be7b-c588a6768154.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/215-2c9ae4da-2c64-4008-b47b-dbf40ada571b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/215-2c9ae4da-2c64-4008-b47b-dbf40ada571b.txn new file mode 100644 index 0000000000000000000000000000000000000000..f8137c45aa9b439c223a016878085890797e7201 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/215-2c9ae4da-2c64-4008-b47b-dbf40ada571b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/216-17fb96e6-241d-4b0b-8a94-cc20202696fb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/216-17fb96e6-241d-4b0b-8a94-cc20202696fb.txn new file mode 100644 index 0000000000000000000000000000000000000000..5dc95d13492b7c2393ac95bf26f4622ffbda1ab2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/216-17fb96e6-241d-4b0b-8a94-cc20202696fb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/217-2d4af277-cdac-4f44-875b-b66ec4ea3763.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/217-2d4af277-cdac-4f44-875b-b66ec4ea3763.txn new file mode 100644 index 0000000000000000000000000000000000000000..b0918ac8056a664e55f18014612a2f1095a77942 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/217-2d4af277-cdac-4f44-875b-b66ec4ea3763.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/218-33485294-6fb8-4c19-a35e-76312ec0cbd9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/218-33485294-6fb8-4c19-a35e-76312ec0cbd9.txn new file mode 100644 index 0000000000000000000000000000000000000000..45e2e9819374776fe96e90d39d2e6904bbd94102 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/218-33485294-6fb8-4c19-a35e-76312ec0cbd9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/219-8cb12d01-4873-428e-9504-ecaf9f496190.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/219-8cb12d01-4873-428e-9504-ecaf9f496190.txn new file mode 100644 index 0000000000000000000000000000000000000000..b5066b30a5c958f87f2980b2c8e8b1c883a7c292 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/219-8cb12d01-4873-428e-9504-ecaf9f496190.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/22-0e66f422-e4dd-4269-85b3-6789c4926610.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/22-0e66f422-e4dd-4269-85b3-6789c4926610.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0a82f4c5f560b4ee0ea3c3238b9c893c9c8faf9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/22-0e66f422-e4dd-4269-85b3-6789c4926610.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/220-dd6b963a-c7ad-46e4-ba9c-6d79c5cbaf5f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/220-dd6b963a-c7ad-46e4-ba9c-6d79c5cbaf5f.txn new file mode 100644 index 0000000000000000000000000000000000000000..12e260201b339dff224a16c74aae33fc89b2f37e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/220-dd6b963a-c7ad-46e4-ba9c-6d79c5cbaf5f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/221-449b987e-dcd1-4b67-aeaf-7e83f946be21.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/221-449b987e-dcd1-4b67-aeaf-7e83f946be21.txn new file mode 100644 index 0000000000000000000000000000000000000000..1c2538e931da5e5f19733f47f1a11912c5cd1974 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/221-449b987e-dcd1-4b67-aeaf-7e83f946be21.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/222-2d0f4e98-55f6-411a-934f-21f23bd56c87.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/222-2d0f4e98-55f6-411a-934f-21f23bd56c87.txn new file mode 100644 index 0000000000000000000000000000000000000000..7703f633b63fa91b0d7212a5e7c7e76f66b87aef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/222-2d0f4e98-55f6-411a-934f-21f23bd56c87.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/223-881cf57d-b882-4bf0-bb1e-06f4eeca02ef.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/223-881cf57d-b882-4bf0-bb1e-06f4eeca02ef.txn new file mode 100644 index 0000000000000000000000000000000000000000..a015f4965043ff38824301086cefc1fddc9717bb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/223-881cf57d-b882-4bf0-bb1e-06f4eeca02ef.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/224-22b0d954-b6a9-4c69-9ba6-636cbdbc1eb0.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/224-22b0d954-b6a9-4c69-9ba6-636cbdbc1eb0.txn new file mode 100644 index 0000000000000000000000000000000000000000..6d3f99290aebb5d84c66bfb38346ba0150753e8e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/224-22b0d954-b6a9-4c69-9ba6-636cbdbc1eb0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/225-b5a6a282-11e6-4257-8a3c-8e43162a8356.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/225-b5a6a282-11e6-4257-8a3c-8e43162a8356.txn new file mode 100644 index 0000000000000000000000000000000000000000..d66909d055984ffacc961b231a29ef5b12dd6620 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/225-b5a6a282-11e6-4257-8a3c-8e43162a8356.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/226-3f894192-6f2b-4229-bcb3-cff442994524.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/226-3f894192-6f2b-4229-bcb3-cff442994524.txn new file mode 100644 index 0000000000000000000000000000000000000000..55fd0c078b90de55197bfdc7550ea7067f4b8f12 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/226-3f894192-6f2b-4229-bcb3-cff442994524.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/227-a4aa5151-8b8c-467a-8a95-caf0c649d721.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/227-a4aa5151-8b8c-467a-8a95-caf0c649d721.txn new file mode 100644 index 0000000000000000000000000000000000000000..24c4c65d24cc8e43ea51577c0cf23248721c5996 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/227-a4aa5151-8b8c-467a-8a95-caf0c649d721.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/228-fc80f592-68b7-4668-bae0-a65e4d7327cf.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/228-fc80f592-68b7-4668-bae0-a65e4d7327cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..ae9f8834ca804193d2e72791743d8e779aec8eae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/228-fc80f592-68b7-4668-bae0-a65e4d7327cf.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/229-cad0fcdc-8784-4ef3-8730-b2794479b232.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/229-cad0fcdc-8784-4ef3-8730-b2794479b232.txn new file mode 100644 index 0000000000000000000000000000000000000000..503d3a31077ba1d22b295eb1eace320184dd6234 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/229-cad0fcdc-8784-4ef3-8730-b2794479b232.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/23-4ba63a24-3a7e-470f-bbdb-f3cda4b096ee.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/23-4ba63a24-3a7e-470f-bbdb-f3cda4b096ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..59320c05530c5af1c5bb920de72f3a27f06a7cd0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/23-4ba63a24-3a7e-470f-bbdb-f3cda4b096ee.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/230-01e638d2-10bd-4336-94b2-75b4a19f8b33.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/230-01e638d2-10bd-4336-94b2-75b4a19f8b33.txn new file mode 100644 index 0000000000000000000000000000000000000000..18887dc95e48bc18b0103852977a959f5696f44c Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/230-01e638d2-10bd-4336-94b2-75b4a19f8b33.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/231-0ca52b5d-adaa-4367-be8f-a4eb64a255b1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/231-0ca52b5d-adaa-4367-be8f-a4eb64a255b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..5f507d4505e63e7e6129da333ca9219451a95e8d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/231-0ca52b5d-adaa-4367-be8f-a4eb64a255b1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/232-06c73e28-c8ed-4245-8696-8e4b744dda87.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/232-06c73e28-c8ed-4245-8696-8e4b744dda87.txn new file mode 100644 index 0000000000000000000000000000000000000000..734e58c5d0969eca695073aebc9c0f4026553472 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/232-06c73e28-c8ed-4245-8696-8e4b744dda87.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/233-9c315920-3342-48d9-a63d-87b106290169.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/233-9c315920-3342-48d9-a63d-87b106290169.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0ed54bf7d92c1c3e76a9a8bd84a5445b136f77d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/233-9c315920-3342-48d9-a63d-87b106290169.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/234-e9458c13-02ba-45cf-bf04-17caa3a49ccc.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/234-e9458c13-02ba-45cf-bf04-17caa3a49ccc.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f8717d891418d244f6d7fae3a54d0a6eb2d1671 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/234-e9458c13-02ba-45cf-bf04-17caa3a49ccc.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/235-84b0ea8c-f165-4d21-8616-a4c8c3bf064c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/235-84b0ea8c-f165-4d21-8616-a4c8c3bf064c.txn new file mode 100644 index 0000000000000000000000000000000000000000..b9c23ace399d5c68ccd16ea8a8cd06aa69ebb9e7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/235-84b0ea8c-f165-4d21-8616-a4c8c3bf064c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/236-5af77356-5f69-4168-bc87-5420c5a0ca0d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/236-5af77356-5f69-4168-bc87-5420c5a0ca0d.txn new file mode 100644 index 0000000000000000000000000000000000000000..e31f6f03e3b2e57eaca281c0dbcd436db19c6682 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/236-5af77356-5f69-4168-bc87-5420c5a0ca0d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/237-5122f2b2-bc9f-401d-803f-00cfcc48f799.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/237-5122f2b2-bc9f-401d-803f-00cfcc48f799.txn new file mode 100644 index 0000000000000000000000000000000000000000..4220dcdbd4de4d37ca8ea20d0c010aa792af3aa4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/237-5122f2b2-bc9f-401d-803f-00cfcc48f799.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/238-bcf34a25-1cba-4dfd-b2a8-d788eca45890.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/238-bcf34a25-1cba-4dfd-b2a8-d788eca45890.txn new file mode 100644 index 0000000000000000000000000000000000000000..0b758dce793d4ac49cfcefcbfc41375ca7450815 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/238-bcf34a25-1cba-4dfd-b2a8-d788eca45890.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/239-78a18182-a815-4b66-a404-7e84be7dbd38.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/239-78a18182-a815-4b66-a404-7e84be7dbd38.txn new file mode 100644 index 0000000000000000000000000000000000000000..289b8dd2459d91c9986458bed679400c0059d9d2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/239-78a18182-a815-4b66-a404-7e84be7dbd38.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/24-7daa309a-b59c-4ade-a5a4-726db9cc782a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/24-7daa309a-b59c-4ade-a5a4-726db9cc782a.txn new file mode 100644 index 0000000000000000000000000000000000000000..39aa0b05c97bfaebe8ffcb3c41d16c9688191a71 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/24-7daa309a-b59c-4ade-a5a4-726db9cc782a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/240-c1ef8a35-cca2-40c6-889b-c03c0735e3bf.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/240-c1ef8a35-cca2-40c6-889b-c03c0735e3bf.txn new file mode 100644 index 0000000000000000000000000000000000000000..c2d1c7c48bfa70b150dc65ae5afdda94203fb935 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/240-c1ef8a35-cca2-40c6-889b-c03c0735e3bf.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/241-d4fa0071-bc1f-4d6b-8b34-751057ddde23.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/241-d4fa0071-bc1f-4d6b-8b34-751057ddde23.txn new file mode 100644 index 0000000000000000000000000000000000000000..5470ea3669abe7160ae4075cf9acea1450b37b03 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/241-d4fa0071-bc1f-4d6b-8b34-751057ddde23.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/242-26d99242-232f-4e97-9196-3877ebf98de8.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/242-26d99242-232f-4e97-9196-3877ebf98de8.txn new file mode 100644 index 0000000000000000000000000000000000000000..a869ad9939b451d5d5d90940b11eb783c2cdc2cc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/242-26d99242-232f-4e97-9196-3877ebf98de8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/243-58348c59-89a0-4e82-a7e0-faa60f104759.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/243-58348c59-89a0-4e82-a7e0-faa60f104759.txn new file mode 100644 index 0000000000000000000000000000000000000000..61730dfdb9ddcca86efbbf253d17916168f87478 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/243-58348c59-89a0-4e82-a7e0-faa60f104759.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/244-8944e89d-24bf-4611-95dd-d5b4dce7f5f9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/244-8944e89d-24bf-4611-95dd-d5b4dce7f5f9.txn new file mode 100644 index 0000000000000000000000000000000000000000..13d53a72831dd388fe243deb8b858317040dd2c1 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/244-8944e89d-24bf-4611-95dd-d5b4dce7f5f9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/245-e2437f9e-5382-4a85-be8c-7131204f3ce7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/245-e2437f9e-5382-4a85-be8c-7131204f3ce7.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd4c54176cd3533a27f252acd7512c153b95f565 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/245-e2437f9e-5382-4a85-be8c-7131204f3ce7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/246-32a1b5b3-00e6-4cdd-9f90-72d2b387566f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/246-32a1b5b3-00e6-4cdd-9f90-72d2b387566f.txn new file mode 100644 index 0000000000000000000000000000000000000000..815720e797c72cd61e1cba70d39c44b13a1cd29c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/246-32a1b5b3-00e6-4cdd-9f90-72d2b387566f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/247-399f1b07-5fef-4529-9ed6-7a81e29fd355.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/247-399f1b07-5fef-4529-9ed6-7a81e29fd355.txn new file mode 100644 index 0000000000000000000000000000000000000000..1988ad52557ebe7b148a7cc0164e747b9cbee51a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/247-399f1b07-5fef-4529-9ed6-7a81e29fd355.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/248-697d5035-0aa6-4a02-b6c4-7b8a93766e43.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/248-697d5035-0aa6-4a02-b6c4-7b8a93766e43.txn new file mode 100644 index 0000000000000000000000000000000000000000..ab6dddff20259dd548e983cc719d34084c7790e5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/248-697d5035-0aa6-4a02-b6c4-7b8a93766e43.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/249-e62d55a1-0ed4-446a-8896-78a48ab7ca30.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/249-e62d55a1-0ed4-446a-8896-78a48ab7ca30.txn new file mode 100644 index 0000000000000000000000000000000000000000..5c0d24ce12e69f927873b6e304c4ecf20a29e505 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/249-e62d55a1-0ed4-446a-8896-78a48ab7ca30.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/25-a9cffc7f-0be2-4e7a-8b15-2a7da164c614.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/25-a9cffc7f-0be2-4e7a-8b15-2a7da164c614.txn new file mode 100644 index 0000000000000000000000000000000000000000..e2d4f900b59cbaf9bb30c07fad564bdeeb62006f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/25-a9cffc7f-0be2-4e7a-8b15-2a7da164c614.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/250-6a649477-20c1-4a99-851f-68c2a4f983e5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/250-6a649477-20c1-4a99-851f-68c2a4f983e5.txn new file mode 100644 index 0000000000000000000000000000000000000000..c4bed9980979db7e7625879a5e2f60300261ffc5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/250-6a649477-20c1-4a99-851f-68c2a4f983e5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/251-ae89744c-99c4-4a87-bf24-68db86845e42.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/251-ae89744c-99c4-4a87-bf24-68db86845e42.txn new file mode 100644 index 0000000000000000000000000000000000000000..119ce14654cd0c970b70f779e4901b7449633bd7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/251-ae89744c-99c4-4a87-bf24-68db86845e42.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/252-51edddec-7f5f-41ed-9374-58daa81147bb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/252-51edddec-7f5f-41ed-9374-58daa81147bb.txn new file mode 100644 index 0000000000000000000000000000000000000000..e5550407d291b7512073b3500986a59a6b8f929d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/252-51edddec-7f5f-41ed-9374-58daa81147bb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/253-b5f12b4f-7f91-42d2-85a3-f6d81f8c2d55.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/253-b5f12b4f-7f91-42d2-85a3-f6d81f8c2d55.txn new file mode 100644 index 0000000000000000000000000000000000000000..e86fee8119983296ecae3e13d88b23e3044461fa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/253-b5f12b4f-7f91-42d2-85a3-f6d81f8c2d55.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/254-7414e777-72d1-4adb-bb44-1cf14665f319.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/254-7414e777-72d1-4adb-bb44-1cf14665f319.txn new file mode 100644 index 0000000000000000000000000000000000000000..b4ac890f4ed8963153c07dab559478a9fc144818 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/254-7414e777-72d1-4adb-bb44-1cf14665f319.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/255-117affe6-7d17-4bdd-9c12-016300a1390a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/255-117affe6-7d17-4bdd-9c12-016300a1390a.txn new file mode 100644 index 0000000000000000000000000000000000000000..6cb54403ce297b9e2637ea13606f1fa6d7eea9f8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/255-117affe6-7d17-4bdd-9c12-016300a1390a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/256-abfd9b5b-0090-402b-a338-557914ebb79b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/256-abfd9b5b-0090-402b-a338-557914ebb79b.txn new file mode 100644 index 0000000000000000000000000000000000000000..a01d5808576583aab1bd032fec4b0aa9eb9c4bf5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/256-abfd9b5b-0090-402b-a338-557914ebb79b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/257-c72683bb-5dee-4e39-8617-9f0c3e2998f2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/257-c72683bb-5dee-4e39-8617-9f0c3e2998f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..466e7ef0e3b8cd84104b61b85db9a0af2bcaa80a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/257-c72683bb-5dee-4e39-8617-9f0c3e2998f2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/258-7b4b63c7-3424-426a-b29b-ec47c6a1f5ef.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/258-7b4b63c7-3424-426a-b29b-ec47c6a1f5ef.txn new file mode 100644 index 0000000000000000000000000000000000000000..bdb8371c291a75ac8907099b50b4d5f3603cdb8c Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/258-7b4b63c7-3424-426a-b29b-ec47c6a1f5ef.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/259-a47057bd-c9d6-4bff-bc92-a0fedd14d702.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/259-a47057bd-c9d6-4bff-bc92-a0fedd14d702.txn new file mode 100644 index 0000000000000000000000000000000000000000..4d801a9eb324fa09a4c7577b2f2544d3566188a2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/259-a47057bd-c9d6-4bff-bc92-a0fedd14d702.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/26-61137874-e5dd-4f9c-afbd-f320a76fac6c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/26-61137874-e5dd-4f9c-afbd-f320a76fac6c.txn new file mode 100644 index 0000000000000000000000000000000000000000..ad7c35e99a913dc9c6dc8616c823dc5371d8bb3c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/26-61137874-e5dd-4f9c-afbd-f320a76fac6c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/260-c425e294-8b2f-465a-935a-7b33a90572ad.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/260-c425e294-8b2f-465a-935a-7b33a90572ad.txn new file mode 100644 index 0000000000000000000000000000000000000000..d4d3870db0acee8867fb529c3dcd82eed972ee23 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/260-c425e294-8b2f-465a-935a-7b33a90572ad.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/261-bc72a548-382f-456d-9d02-7c84e83ccb96.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/261-bc72a548-382f-456d-9d02-7c84e83ccb96.txn new file mode 100644 index 0000000000000000000000000000000000000000..75cadffe84335bc30cac209d278efaf0d85661a7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/261-bc72a548-382f-456d-9d02-7c84e83ccb96.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/262-9019b627-7074-4a0a-bd51-a3a7b3818714.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/262-9019b627-7074-4a0a-bd51-a3a7b3818714.txn new file mode 100644 index 0000000000000000000000000000000000000000..f6e4bbe6d61bf5482d8b8e334975b0f5f7d87b2d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/262-9019b627-7074-4a0a-bd51-a3a7b3818714.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/263-fc970eb7-5112-42b0-a96f-19b997db0555.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/263-fc970eb7-5112-42b0-a96f-19b997db0555.txn new file mode 100644 index 0000000000000000000000000000000000000000..25b069978941bf9e7cabb05163d6ed6350140fab Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/263-fc970eb7-5112-42b0-a96f-19b997db0555.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/264-22146d77-eee2-4b5a-b8e6-7f814c3d2425.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/264-22146d77-eee2-4b5a-b8e6-7f814c3d2425.txn new file mode 100644 index 0000000000000000000000000000000000000000..75001d2e386076d1d392686962ecfcdec8636040 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/264-22146d77-eee2-4b5a-b8e6-7f814c3d2425.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/265-24cd005c-ec72-4591-95e2-190c51987c45.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/265-24cd005c-ec72-4591-95e2-190c51987c45.txn new file mode 100644 index 0000000000000000000000000000000000000000..e8c657802027fa2dda0a26861c61ecfd44391d85 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/265-24cd005c-ec72-4591-95e2-190c51987c45.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/266-53c05ff6-d5e4-4f51-bab0-807bbbed806b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/266-53c05ff6-d5e4-4f51-bab0-807bbbed806b.txn new file mode 100644 index 0000000000000000000000000000000000000000..4c9650c149bf040a691e1dbdba0df8ea7139b0c0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/266-53c05ff6-d5e4-4f51-bab0-807bbbed806b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/267-a810ffda-c9b8-4a68-8939-997f8009b086.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/267-a810ffda-c9b8-4a68-8939-997f8009b086.txn new file mode 100644 index 0000000000000000000000000000000000000000..410b0d63be676a0c955cdd3e8e6bd76a4cf06ba2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/267-a810ffda-c9b8-4a68-8939-997f8009b086.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/268-25a076c4-bddf-4326-9895-9f01f9feed78.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/268-25a076c4-bddf-4326-9895-9f01f9feed78.txn new file mode 100644 index 0000000000000000000000000000000000000000..d0093bafa97aace0afc2a579e6be70cd53740e88 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/268-25a076c4-bddf-4326-9895-9f01f9feed78.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/269-4ea5d5b3-e8a5-4ef7-9ee6-7921c8c3d665.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/269-4ea5d5b3-e8a5-4ef7-9ee6-7921c8c3d665.txn new file mode 100644 index 0000000000000000000000000000000000000000..13dc6b4834560331480d7d2f42a39191cbfbfa96 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/269-4ea5d5b3-e8a5-4ef7-9ee6-7921c8c3d665.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/27-74ba64a4-ad79-4709-a24c-f0cb458818c6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/27-74ba64a4-ad79-4709-a24c-f0cb458818c6.txn new file mode 100644 index 0000000000000000000000000000000000000000..f9e200b7d73725b56b886b18f9fa7cfb7f7a9569 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/27-74ba64a4-ad79-4709-a24c-f0cb458818c6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/270-9cc7753f-ad83-48d6-babd-10b8d582f194.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/270-9cc7753f-ad83-48d6-babd-10b8d582f194.txn new file mode 100644 index 0000000000000000000000000000000000000000..2f4c8bf30014e9579fe60c2e9f674a4555d77014 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/270-9cc7753f-ad83-48d6-babd-10b8d582f194.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/271-5732a202-ff15-4991-a95a-931b49100df4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/271-5732a202-ff15-4991-a95a-931b49100df4.txn new file mode 100644 index 0000000000000000000000000000000000000000..6edfa7a04478501c97033c1bfec144ae5bec085c Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/271-5732a202-ff15-4991-a95a-931b49100df4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/272-bcc2eb02-bf16-46fc-8efd-ef5a86a6c932.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/272-bcc2eb02-bf16-46fc-8efd-ef5a86a6c932.txn new file mode 100644 index 0000000000000000000000000000000000000000..b6809b35afb5a7099ac902c430923d786004b863 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/272-bcc2eb02-bf16-46fc-8efd-ef5a86a6c932.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/273-a866ff7c-3fe8-441a-84c4-598d36f4a787.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/273-a866ff7c-3fe8-441a-84c4-598d36f4a787.txn new file mode 100644 index 0000000000000000000000000000000000000000..173316ce8e99d8d476bb5fe672cb423ca203e0e3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/273-a866ff7c-3fe8-441a-84c4-598d36f4a787.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/274-5ef216f6-9d5f-4d3b-b86a-62115e8a4364.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/274-5ef216f6-9d5f-4d3b-b86a-62115e8a4364.txn new file mode 100644 index 0000000000000000000000000000000000000000..b54ad782dae038d8f9bca555e5882629ebc853f0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/274-5ef216f6-9d5f-4d3b-b86a-62115e8a4364.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/275-483930f5-5d47-44c0-a15e-141d7d318beb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/275-483930f5-5d47-44c0-a15e-141d7d318beb.txn new file mode 100644 index 0000000000000000000000000000000000000000..fffa7d2df63748c44e5ab584300f8000160399d1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/275-483930f5-5d47-44c0-a15e-141d7d318beb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/276-3726ca7c-91fd-4aec-9983-7601eb4d3cc0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/276-3726ca7c-91fd-4aec-9983-7601eb4d3cc0.txn new file mode 100644 index 0000000000000000000000000000000000000000..15b660839834e994e184fae825d813817e7bf07f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/276-3726ca7c-91fd-4aec-9983-7601eb4d3cc0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/277-8a5f6e73-4a25-4537-8ee7-83e6408fe178.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/277-8a5f6e73-4a25-4537-8ee7-83e6408fe178.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ddd4b3a7d76791e0baebc492e45f1cc7693be68 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/277-8a5f6e73-4a25-4537-8ee7-83e6408fe178.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/278-30045935-9a55-4b36-9ca0-d9e735215305.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/278-30045935-9a55-4b36-9ca0-d9e735215305.txn new file mode 100644 index 0000000000000000000000000000000000000000..ac33ae7acfe037fea246ce8d246209cf4294fbe4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/278-30045935-9a55-4b36-9ca0-d9e735215305.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/279-8583a106-908d-4354-910e-0b5d5e2f9ef8.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/279-8583a106-908d-4354-910e-0b5d5e2f9ef8.txn new file mode 100644 index 0000000000000000000000000000000000000000..01715a30d3ab9b0e672c4d860254a6637d9929cb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/279-8583a106-908d-4354-910e-0b5d5e2f9ef8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/28-ea13c2fb-1b61-45d4-8cd1-801f4a83001d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/28-ea13c2fb-1b61-45d4-8cd1-801f4a83001d.txn new file mode 100644 index 0000000000000000000000000000000000000000..f5699318e0b034eef4fd7ab00a4c999b4eb1ab29 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/28-ea13c2fb-1b61-45d4-8cd1-801f4a83001d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/280-455a344f-b41a-4ff2-ad18-96e9d071312b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/280-455a344f-b41a-4ff2-ad18-96e9d071312b.txn new file mode 100644 index 0000000000000000000000000000000000000000..e86b9d0e0f4c290bac2924800a29bff8c51eeec3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/280-455a344f-b41a-4ff2-ad18-96e9d071312b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/281-c0d83115-6933-48d9-aa9f-e7022cf593be.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/281-c0d83115-6933-48d9-aa9f-e7022cf593be.txn new file mode 100644 index 0000000000000000000000000000000000000000..c5662fe763b49b537f2a31cf40e60dc4f7f61398 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/281-c0d83115-6933-48d9-aa9f-e7022cf593be.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/282-d41cb5cf-cbf0-4572-9d53-a1aa2999dea7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/282-d41cb5cf-cbf0-4572-9d53-a1aa2999dea7.txn new file mode 100644 index 0000000000000000000000000000000000000000..d189a1622d487cbdc6d2c4b7c3168a567155df8a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/282-d41cb5cf-cbf0-4572-9d53-a1aa2999dea7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/283-26d05969-3b72-4dbc-9497-000c8f3cdcaf.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/283-26d05969-3b72-4dbc-9497-000c8f3cdcaf.txn new file mode 100644 index 0000000000000000000000000000000000000000..733f9688d9b7eee4266b41f6a08d3d37801d7185 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/283-26d05969-3b72-4dbc-9497-000c8f3cdcaf.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/284-229c742b-b4af-49d8-9fda-266e80fe38c5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/284-229c742b-b4af-49d8-9fda-266e80fe38c5.txn new file mode 100644 index 0000000000000000000000000000000000000000..ba9e672397590f20687a2ba22b1d4959b7275497 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/284-229c742b-b4af-49d8-9fda-266e80fe38c5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/285-cd78996a-d7ec-4b5c-835c-7c86d87cfb64.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/285-cd78996a-d7ec-4b5c-835c-7c86d87cfb64.txn new file mode 100644 index 0000000000000000000000000000000000000000..73b71251e1656912818b0521ac913e5209c249bb Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/285-cd78996a-d7ec-4b5c-835c-7c86d87cfb64.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/286-7c7faaec-dcf6-4394-89fc-ee81f1359feb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/286-7c7faaec-dcf6-4394-89fc-ee81f1359feb.txn new file mode 100644 index 0000000000000000000000000000000000000000..a27b297a7be985dcd286265ef0217b84496ed544 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/286-7c7faaec-dcf6-4394-89fc-ee81f1359feb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/287-7854345b-8414-4f61-8886-20992a8688a5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/287-7854345b-8414-4f61-8886-20992a8688a5.txn new file mode 100644 index 0000000000000000000000000000000000000000..aa081fed06db52cc3b0b432ff537375fdc556c2f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/287-7854345b-8414-4f61-8886-20992a8688a5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/288-7148ef4b-5b79-4b5f-ba8b-28fc6ef3f436.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/288-7148ef4b-5b79-4b5f-ba8b-28fc6ef3f436.txn new file mode 100644 index 0000000000000000000000000000000000000000..e136a038680231cf3517ee549d4102b4fcf3786d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/288-7148ef4b-5b79-4b5f-ba8b-28fc6ef3f436.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/289-e5b5daac-f18a-4480-81dd-9e54d736db04.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/289-e5b5daac-f18a-4480-81dd-9e54d736db04.txn new file mode 100644 index 0000000000000000000000000000000000000000..25183ce2669ae4b001a24e56cd4fb66e2d95d5a4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/289-e5b5daac-f18a-4480-81dd-9e54d736db04.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/29-4a27d9a0-63f7-4a59-bafb-9ac5641c55cb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/29-4a27d9a0-63f7-4a59-bafb-9ac5641c55cb.txn new file mode 100644 index 0000000000000000000000000000000000000000..8ca3993a042825610c9bdcf016f9d69a90a6d693 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/29-4a27d9a0-63f7-4a59-bafb-9ac5641c55cb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/290-81f7b1c7-4bd8-4b7e-a870-b4dc683c2761.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/290-81f7b1c7-4bd8-4b7e-a870-b4dc683c2761.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff0c4d0c0d0536431648da4492c3d8c8dbc5c97f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/290-81f7b1c7-4bd8-4b7e-a870-b4dc683c2761.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/291-19e90945-dc6c-41dd-8718-119feb948a2e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/291-19e90945-dc6c-41dd-8718-119feb948a2e.txn new file mode 100644 index 0000000000000000000000000000000000000000..040f6908973dd54650504d4e605b6487b8cfaa93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/291-19e90945-dc6c-41dd-8718-119feb948a2e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/292-c4ec1437-1e27-4bad-aec2-03c69ac144f0.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/292-c4ec1437-1e27-4bad-aec2-03c69ac144f0.txn new file mode 100644 index 0000000000000000000000000000000000000000..555a999380bf7263b4b37f4c3e4f54e387f1e5e7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/292-c4ec1437-1e27-4bad-aec2-03c69ac144f0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/293-30714722-27fa-4226-9e0d-59b179d46c19.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/293-30714722-27fa-4226-9e0d-59b179d46c19.txn new file mode 100644 index 0000000000000000000000000000000000000000..61d4221317520b575b2b0f2402731db56e6f7a24 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/293-30714722-27fa-4226-9e0d-59b179d46c19.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/294-7fcc3e06-a68c-47e1-950c-1b1586159c52.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/294-7fcc3e06-a68c-47e1-950c-1b1586159c52.txn new file mode 100644 index 0000000000000000000000000000000000000000..ec9f58c93d2272f91b7f8df0408abdc0e22ecaf9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/294-7fcc3e06-a68c-47e1-950c-1b1586159c52.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/295-4d86c8e1-29b8-4d89-b697-0b3e809891cb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/295-4d86c8e1-29b8-4d89-b697-0b3e809891cb.txn new file mode 100644 index 0000000000000000000000000000000000000000..a382a6486d280f10aa3db788e8922286230c8436 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/295-4d86c8e1-29b8-4d89-b697-0b3e809891cb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/296-0b7e87cb-093b-401a-b6a0-8ebb3ff251dd.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/296-0b7e87cb-093b-401a-b6a0-8ebb3ff251dd.txn new file mode 100644 index 0000000000000000000000000000000000000000..c70957db2754a4181ecab1c9d14c16d3c7fa0564 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/296-0b7e87cb-093b-401a-b6a0-8ebb3ff251dd.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/297-3cc393fb-43f8-4c71-aad5-34805ce4fd4e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/297-3cc393fb-43f8-4c71-aad5-34805ce4fd4e.txn new file mode 100644 index 0000000000000000000000000000000000000000..64ec55c13e3f0d6c51ae5cb52251ccfcdf50b394 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/297-3cc393fb-43f8-4c71-aad5-34805ce4fd4e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/298-c0dcc174-39dd-439c-a5ac-0c9853ce972f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/298-c0dcc174-39dd-439c-a5ac-0c9853ce972f.txn new file mode 100644 index 0000000000000000000000000000000000000000..bd8fb6ee7e2c4184c12a004476a8121bb31751b3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/298-c0dcc174-39dd-439c-a5ac-0c9853ce972f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/299-5784884e-67c9-4032-a7ff-058f2578f2cf.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/299-5784884e-67c9-4032-a7ff-058f2578f2cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..6dbd3720be245d8175d9d215d78e33d334fa5704 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/299-5784884e-67c9-4032-a7ff-058f2578f2cf.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/3-cb2d6e15-84f8-4339-b0a5-ae73009711d7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/3-cb2d6e15-84f8-4339-b0a5-ae73009711d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..2584b1593e651a8bef552080b1615ec0b9538e93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/3-cb2d6e15-84f8-4339-b0a5-ae73009711d7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/30-1dd245ea-de2c-42cc-b8d7-04a2f4d5c7a2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/30-1dd245ea-de2c-42cc-b8d7-04a2f4d5c7a2.txn new file mode 100644 index 0000000000000000000000000000000000000000..262fb86dcbef85c9c169183ef1ce0c0c55e774d1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/30-1dd245ea-de2c-42cc-b8d7-04a2f4d5c7a2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/300-7b6441d8-ed15-4645-a9a2-6443b5ba867d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/300-7b6441d8-ed15-4645-a9a2-6443b5ba867d.txn new file mode 100644 index 0000000000000000000000000000000000000000..45412075de10a80e9d6269c8f79c8e7359f8b93b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/300-7b6441d8-ed15-4645-a9a2-6443b5ba867d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/301-6f832569-03eb-42fa-b254-c0e10adecac5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/301-6f832569-03eb-42fa-b254-c0e10adecac5.txn new file mode 100644 index 0000000000000000000000000000000000000000..5198bd7f8ec0696baa400bd83e28252a9081baaa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/301-6f832569-03eb-42fa-b254-c0e10adecac5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/302-5d2307e1-1758-4dfa-8692-d46c910b309e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/302-5d2307e1-1758-4dfa-8692-d46c910b309e.txn new file mode 100644 index 0000000000000000000000000000000000000000..b7a21661ec4a4f563fc690135a169d59f9731221 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/302-5d2307e1-1758-4dfa-8692-d46c910b309e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/303-a4054f63-2203-40bb-a64c-8319f29807ed.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/303-a4054f63-2203-40bb-a64c-8319f29807ed.txn new file mode 100644 index 0000000000000000000000000000000000000000..bd4e2daef57b94734a3a20fa5d58fb39becfb4a7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/303-a4054f63-2203-40bb-a64c-8319f29807ed.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/304-e2abbec1-fae8-4bbc-b0b4-c1ad3d409035.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/304-e2abbec1-fae8-4bbc-b0b4-c1ad3d409035.txn new file mode 100644 index 0000000000000000000000000000000000000000..405716f49f01738587ee734ccb48b88b9d4d5257 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/304-e2abbec1-fae8-4bbc-b0b4-c1ad3d409035.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/305-7ee1ad6a-216f-4662-82a3-b15f7c912b16.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/305-7ee1ad6a-216f-4662-82a3-b15f7c912b16.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b4da140501a5a5f364c2fa22958df310da3ca22 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/305-7ee1ad6a-216f-4662-82a3-b15f7c912b16.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/306-a9e28055-0e58-4af4-b3c0-9e1eae815de7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/306-a9e28055-0e58-4af4-b3c0-9e1eae815de7.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e9b0415f5212e2fb0c8f99fb0ea0740feaa2fef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/306-a9e28055-0e58-4af4-b3c0-9e1eae815de7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/307-097659cc-c82b-4dea-a9cd-7dbf0b7fdc85.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/307-097659cc-c82b-4dea-a9cd-7dbf0b7fdc85.txn new file mode 100644 index 0000000000000000000000000000000000000000..881193be57153450439a2bd53f848e75b1a5348b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/307-097659cc-c82b-4dea-a9cd-7dbf0b7fdc85.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/308-cb60cafd-f9a5-42e5-aadd-da324222da2f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/308-cb60cafd-f9a5-42e5-aadd-da324222da2f.txn new file mode 100644 index 0000000000000000000000000000000000000000..c9c7875a8cb578c4aebab7893b996ad77ba0f523 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/308-cb60cafd-f9a5-42e5-aadd-da324222da2f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/309-8051daa7-5c1d-4194-b99c-9ede32a676e6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/309-8051daa7-5c1d-4194-b99c-9ede32a676e6.txn new file mode 100644 index 0000000000000000000000000000000000000000..38d4221a9144bbdb70fa7b1cdc004e0fd456eb7c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/309-8051daa7-5c1d-4194-b99c-9ede32a676e6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/31-c52bbc6c-916b-4d0d-be92-9b7fc9eb2ba0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/31-c52bbc6c-916b-4d0d-be92-9b7fc9eb2ba0.txn new file mode 100644 index 0000000000000000000000000000000000000000..8fe9d0f72f268c672cd9997afca05ff76d889d22 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/31-c52bbc6c-916b-4d0d-be92-9b7fc9eb2ba0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/310-b181dfff-2a63-429b-9732-0813fb6f3998.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/310-b181dfff-2a63-429b-9732-0813fb6f3998.txn new file mode 100644 index 0000000000000000000000000000000000000000..22e21a97be325e0aa0b1da922cfaff77895f9c46 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/310-b181dfff-2a63-429b-9732-0813fb6f3998.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/311-9982099c-5ae7-47ea-b8d6-c3fa97922e87.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/311-9982099c-5ae7-47ea-b8d6-c3fa97922e87.txn new file mode 100644 index 0000000000000000000000000000000000000000..69fa64440ccb2a6381692456dde63f998cced86d Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/311-9982099c-5ae7-47ea-b8d6-c3fa97922e87.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/312-09b9cab4-795f-44a4-ae94-d5fd2763c74b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/312-09b9cab4-795f-44a4-ae94-d5fd2763c74b.txn new file mode 100644 index 0000000000000000000000000000000000000000..611dfb2d7fda1837ffa89b46594f5d3953ba48a1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/312-09b9cab4-795f-44a4-ae94-d5fd2763c74b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/313-0f68a3ea-97e9-4893-aa5d-4a40e5beeca3.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/313-0f68a3ea-97e9-4893-aa5d-4a40e5beeca3.txn new file mode 100644 index 0000000000000000000000000000000000000000..0426810bfb2173c2f314466bd63658c740f317fd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/313-0f68a3ea-97e9-4893-aa5d-4a40e5beeca3.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/314-4b9d8786-168c-4069-a9f6-3e04ec92ed84.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/314-4b9d8786-168c-4069-a9f6-3e04ec92ed84.txn new file mode 100644 index 0000000000000000000000000000000000000000..50e3bffbaead9795e03328e019afcb5ba49c5b76 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/314-4b9d8786-168c-4069-a9f6-3e04ec92ed84.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/315-1e981265-eaab-49f7-ac93-5b77d58012f6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/315-1e981265-eaab-49f7-ac93-5b77d58012f6.txn new file mode 100644 index 0000000000000000000000000000000000000000..4929f56e0b6023b3628b54155377209bf39ec544 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/315-1e981265-eaab-49f7-ac93-5b77d58012f6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/316-b929c950-ca9e-4ec3-ab3f-252b2de2e2a9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/316-b929c950-ca9e-4ec3-ab3f-252b2de2e2a9.txn new file mode 100644 index 0000000000000000000000000000000000000000..516c8de7ca899742e7a627e3f6556bcb1ba485a9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/316-b929c950-ca9e-4ec3-ab3f-252b2de2e2a9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/317-c901d421-fb89-4d14-8797-c229a02d10ae.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/317-c901d421-fb89-4d14-8797-c229a02d10ae.txn new file mode 100644 index 0000000000000000000000000000000000000000..62d8c1a397184460161283c3bfe8d64545fc8717 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/317-c901d421-fb89-4d14-8797-c229a02d10ae.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/318-8751970a-b80d-4ab5-884a-ce0325936ca4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/318-8751970a-b80d-4ab5-884a-ce0325936ca4.txn new file mode 100644 index 0000000000000000000000000000000000000000..2a6f32f0c26cfbabca44b8f94191abc3694f650f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/318-8751970a-b80d-4ab5-884a-ce0325936ca4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/319-911f4236-6c2a-48d0-a18c-c913e1c412ab.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/319-911f4236-6c2a-48d0-a18c-c913e1c412ab.txn new file mode 100644 index 0000000000000000000000000000000000000000..9018acc6c502b5fd21a27d824d90e1a24f954a37 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/319-911f4236-6c2a-48d0-a18c-c913e1c412ab.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/32-66b9f02f-a8f5-45d3-87ee-6ebd8d5db96a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/32-66b9f02f-a8f5-45d3-87ee-6ebd8d5db96a.txn new file mode 100644 index 0000000000000000000000000000000000000000..2ad993f0bbef2c4eb997d7b026a337c6eac90837 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/32-66b9f02f-a8f5-45d3-87ee-6ebd8d5db96a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/320-a7bd57ef-4b8d-4c14-a132-588f7603caf2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/320-a7bd57ef-4b8d-4c14-a132-588f7603caf2.txn new file mode 100644 index 0000000000000000000000000000000000000000..8dd19c73183b0f692875a84b73bee2c1b3608376 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/320-a7bd57ef-4b8d-4c14-a132-588f7603caf2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/321-75f03639-6586-4c6e-bade-30adedfa5834.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/321-75f03639-6586-4c6e-bade-30adedfa5834.txn new file mode 100644 index 0000000000000000000000000000000000000000..79728604e33bac33ae7b89b5fbfb42243402fe37 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/321-75f03639-6586-4c6e-bade-30adedfa5834.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/322-89111554-9f11-41f7-9903-d4cf37c55d3a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/322-89111554-9f11-41f7-9903-d4cf37c55d3a.txn new file mode 100644 index 0000000000000000000000000000000000000000..f6979a6d0b29f47d6d958964938e9dda9ce6ea0f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/322-89111554-9f11-41f7-9903-d4cf37c55d3a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/323-fb6350d9-6ccb-49a3-b9fd-cf447bfb8a81.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/323-fb6350d9-6ccb-49a3-b9fd-cf447bfb8a81.txn new file mode 100644 index 0000000000000000000000000000000000000000..22401578fb243ec5efba6d5e36cb059565ee6020 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/323-fb6350d9-6ccb-49a3-b9fd-cf447bfb8a81.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/324-ad43158e-7c99-4ec9-bc54-b255bd082d44.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/324-ad43158e-7c99-4ec9-bc54-b255bd082d44.txn new file mode 100644 index 0000000000000000000000000000000000000000..d08ed8c4fee69874b68fc28eab5b1b1bead56010 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/324-ad43158e-7c99-4ec9-bc54-b255bd082d44.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/325-3e4a63a4-c616-4495-8c9a-4c229b1ce898.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/325-3e4a63a4-c616-4495-8c9a-4c229b1ce898.txn new file mode 100644 index 0000000000000000000000000000000000000000..9e73fe50dfb0e18d2afa79eec449db598f884db5 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/325-3e4a63a4-c616-4495-8c9a-4c229b1ce898.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/326-c5ccb8e7-6e43-4ed1-9f43-8c3707b18673.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/326-c5ccb8e7-6e43-4ed1-9f43-8c3707b18673.txn new file mode 100644 index 0000000000000000000000000000000000000000..9ec3b5fa1f8ced531f99e3f45cde11ef2c09524b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/326-c5ccb8e7-6e43-4ed1-9f43-8c3707b18673.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/327-66b95175-88bb-40d3-98ef-11d6b15f9037.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/327-66b95175-88bb-40d3-98ef-11d6b15f9037.txn new file mode 100644 index 0000000000000000000000000000000000000000..353383acec81f68b94f6fbe140f16a0aabbc6139 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/327-66b95175-88bb-40d3-98ef-11d6b15f9037.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/328-e14bb10d-7ef5-42fc-9b75-595b562579e2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/328-e14bb10d-7ef5-42fc-9b75-595b562579e2.txn new file mode 100644 index 0000000000000000000000000000000000000000..bdd2dca9f6d4aeb75c6d634ec44c27bba678fb5a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/328-e14bb10d-7ef5-42fc-9b75-595b562579e2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/329-add5aeb6-df2b-4724-bc6a-f49830143f1c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/329-add5aeb6-df2b-4724-bc6a-f49830143f1c.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e592b32c601db92f1be3f9d5c630f102f896f3b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/329-add5aeb6-df2b-4724-bc6a-f49830143f1c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/33-ef1070b5-1ade-44b7-923f-1cecc6663930.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/33-ef1070b5-1ade-44b7-923f-1cecc6663930.txn new file mode 100644 index 0000000000000000000000000000000000000000..37897f59e42f782f5362e0d3eb38ae47ce13612e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/33-ef1070b5-1ade-44b7-923f-1cecc6663930.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/330-8dc9b06d-299e-4bef-94fc-45c12bfd11c0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/330-8dc9b06d-299e-4bef-94fc-45c12bfd11c0.txn new file mode 100644 index 0000000000000000000000000000000000000000..b0b061fdcf10abe3bfcfc563af3596725488cd42 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/330-8dc9b06d-299e-4bef-94fc-45c12bfd11c0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/331-ddbae6ed-fafb-4590-820a-b1ca6a128534.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/331-ddbae6ed-fafb-4590-820a-b1ca6a128534.txn new file mode 100644 index 0000000000000000000000000000000000000000..99094cf23d15b1da6e0a0f30b955177b0b4b1d4c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/331-ddbae6ed-fafb-4590-820a-b1ca6a128534.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/332-9429347c-5d7e-4c73-a992-6eb2e872e431.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/332-9429347c-5d7e-4c73-a992-6eb2e872e431.txn new file mode 100644 index 0000000000000000000000000000000000000000..db93bd9df85e5c51cd8a9f955e2ddc73f83998ae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/332-9429347c-5d7e-4c73-a992-6eb2e872e431.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/333-4b5e5dab-bd3c-4f20-9cd3-4ccb3767053f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/333-4b5e5dab-bd3c-4f20-9cd3-4ccb3767053f.txn new file mode 100644 index 0000000000000000000000000000000000000000..8aa50daff7856889cdb1aea5d064b0a8f2082ee3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/333-4b5e5dab-bd3c-4f20-9cd3-4ccb3767053f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/334-5f107759-78f9-40d9-bf6a-4c673a3d7d4c.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/334-5f107759-78f9-40d9-bf6a-4c673a3d7d4c.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f0f4c60c51709a5fe9e615d07c1b221467c5941 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/334-5f107759-78f9-40d9-bf6a-4c673a3d7d4c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/335-e8faa500-a49f-4671-ad9b-51b54761bac5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/335-e8faa500-a49f-4671-ad9b-51b54761bac5.txn new file mode 100644 index 0000000000000000000000000000000000000000..0676d848e0eb3a22a75b743b86107f5fb936fa10 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/335-e8faa500-a49f-4671-ad9b-51b54761bac5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/336-a803b439-b736-47bb-ae30-17ef82bd4a86.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/336-a803b439-b736-47bb-ae30-17ef82bd4a86.txn new file mode 100644 index 0000000000000000000000000000000000000000..25f34dee7cc810d795c1555054c796dd5d9b1769 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/336-a803b439-b736-47bb-ae30-17ef82bd4a86.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/337-a54ac73a-8125-42aa-9771-7eb9723dae16.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/337-a54ac73a-8125-42aa-9771-7eb9723dae16.txn new file mode 100644 index 0000000000000000000000000000000000000000..d3df9f1cecfe8e09a476b12e20eaa6ae18647235 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/337-a54ac73a-8125-42aa-9771-7eb9723dae16.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/338-61bb0992-821c-4ade-ad31-d37b7ffd8cd8.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/338-61bb0992-821c-4ade-ad31-d37b7ffd8cd8.txn new file mode 100644 index 0000000000000000000000000000000000000000..a15f2fed0747b38e5a94fe279d42be546c19b8a0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/338-61bb0992-821c-4ade-ad31-d37b7ffd8cd8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/339-20eae741-43d4-4077-b553-ad104c82d198.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/339-20eae741-43d4-4077-b553-ad104c82d198.txn new file mode 100644 index 0000000000000000000000000000000000000000..90444f0a6baafb0a946d11e578669f524a12cdb2 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/339-20eae741-43d4-4077-b553-ad104c82d198.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/34-26d932bd-45f9-47cb-8725-1196685b44ac.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/34-26d932bd-45f9-47cb-8725-1196685b44ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..72bb255e8e071bf56d531c33b9c89cb027342851 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/34-26d932bd-45f9-47cb-8725-1196685b44ac.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/340-6263c94b-93f5-4a03-88a9-9d6b418b03ce.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/340-6263c94b-93f5-4a03-88a9-9d6b418b03ce.txn new file mode 100644 index 0000000000000000000000000000000000000000..3ba8990034d9a7fa0042a5dcd8a624b2b600c367 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/340-6263c94b-93f5-4a03-88a9-9d6b418b03ce.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/341-76853561-7c9d-4c4b-898d-12c334988726.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/341-76853561-7c9d-4c4b-898d-12c334988726.txn new file mode 100644 index 0000000000000000000000000000000000000000..92a88292a131e4fa5f2cf261f667cb54a18622c5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/341-76853561-7c9d-4c4b-898d-12c334988726.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/342-9b7320ea-95ac-464e-a0a0-ea67f05d58f1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/342-9b7320ea-95ac-464e-a0a0-ea67f05d58f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..344698dda94a1013b942c4d982a2963335233ad0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/342-9b7320ea-95ac-464e-a0a0-ea67f05d58f1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/343-f575f2bd-2284-40cd-8f29-b220eb71f032.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/343-f575f2bd-2284-40cd-8f29-b220eb71f032.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e9f3b738c60a7b65a2ad7338fa00fe155a55356 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/343-f575f2bd-2284-40cd-8f29-b220eb71f032.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/344-bdd5845e-674c-4f61-9877-82b9aba413b4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/344-bdd5845e-674c-4f61-9877-82b9aba413b4.txn new file mode 100644 index 0000000000000000000000000000000000000000..df43e9ed2a064d54df357f8112b292c2e2a2ddff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/344-bdd5845e-674c-4f61-9877-82b9aba413b4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/345-9357623c-32ef-4ba0-b05d-af22bb8574a4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/345-9357623c-32ef-4ba0-b05d-af22bb8574a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..e5601356a33de80346dbe7dac82d37a55a2f2330 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/345-9357623c-32ef-4ba0-b05d-af22bb8574a4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/346-1c72753e-6e08-4099-9ac8-28a1ac37569f.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/346-1c72753e-6e08-4099-9ac8-28a1ac37569f.txn new file mode 100644 index 0000000000000000000000000000000000000000..fbad277682f98e1aeefcab9f1c9a27cbbad4234a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/346-1c72753e-6e08-4099-9ac8-28a1ac37569f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/347-a403fc8b-fcc9-4a4e-95e8-c3dea0be3c1e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/347-a403fc8b-fcc9-4a4e-95e8-c3dea0be3c1e.txn new file mode 100644 index 0000000000000000000000000000000000000000..6b1121a02df1ecb8cc357efd1d656c8f2c12ac87 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/347-a403fc8b-fcc9-4a4e-95e8-c3dea0be3c1e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/348-92cf5d6a-a370-4d21-a262-c8b7c670ba0e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/348-92cf5d6a-a370-4d21-a262-c8b7c670ba0e.txn new file mode 100644 index 0000000000000000000000000000000000000000..2fb4690b04646d3d756677caa2624e99ec38dcd8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/348-92cf5d6a-a370-4d21-a262-c8b7c670ba0e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/349-3d1e784d-3b73-474d-8665-b18f0caf5ad5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/349-3d1e784d-3b73-474d-8665-b18f0caf5ad5.txn new file mode 100644 index 0000000000000000000000000000000000000000..f2996437f72b8f6759d1f8a4047b578f45d93170 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/349-3d1e784d-3b73-474d-8665-b18f0caf5ad5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/35-10e054eb-6683-41ac-8ca4-f79e6906bc21.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/35-10e054eb-6683-41ac-8ca4-f79e6906bc21.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e382d3bd4eeecb8da100d6dd533c6098b9e8b68 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/35-10e054eb-6683-41ac-8ca4-f79e6906bc21.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/350-836862a9-3b65-4419-89e8-f3dd6ec3ad92.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/350-836862a9-3b65-4419-89e8-f3dd6ec3ad92.txn new file mode 100644 index 0000000000000000000000000000000000000000..93c59cda299df7362197f6ac9830f1bf028c20e6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/350-836862a9-3b65-4419-89e8-f3dd6ec3ad92.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/351-4749999c-a981-40b2-847b-778e37424ca6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/351-4749999c-a981-40b2-847b-778e37424ca6.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb822948fceeee953f99341523a2b2d2add50c41 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/351-4749999c-a981-40b2-847b-778e37424ca6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/352-6f0a05f8-1059-4d4e-bd48-a356c10c2037.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/352-6f0a05f8-1059-4d4e-bd48-a356c10c2037.txn new file mode 100644 index 0000000000000000000000000000000000000000..d7339026fa6ef5e652bebe49ff0c8164819a0efd Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/352-6f0a05f8-1059-4d4e-bd48-a356c10c2037.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/353-0dc298b7-9cd7-4608-9103-796c695173ec.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/353-0dc298b7-9cd7-4608-9103-796c695173ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..17fc31306367d4730622d0a4407a1c7f98a0f1c0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/353-0dc298b7-9cd7-4608-9103-796c695173ec.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/354-d534c03e-46d7-480f-a6c9-402ad2488dc1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/354-d534c03e-46d7-480f-a6c9-402ad2488dc1.txn new file mode 100644 index 0000000000000000000000000000000000000000..c0c6cb8fa4e0ba2eeda863c76fc874debdfa800c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/354-d534c03e-46d7-480f-a6c9-402ad2488dc1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/355-69b30bb7-4352-4acb-a732-a6ff80464370.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/355-69b30bb7-4352-4acb-a732-a6ff80464370.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd1a9a4d308f06559a97f178b054f8135bba9e78 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/355-69b30bb7-4352-4acb-a732-a6ff80464370.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/356-5f0be210-9b2c-423c-976e-209e88833bb1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/356-5f0be210-9b2c-423c-976e-209e88833bb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..e7e28f07f15584113bb6cabc40c937afaaa18818 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/356-5f0be210-9b2c-423c-976e-209e88833bb1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/357-4b2b2d6d-4f6f-4d9e-b1df-e0072ef294c4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/357-4b2b2d6d-4f6f-4d9e-b1df-e0072ef294c4.txn new file mode 100644 index 0000000000000000000000000000000000000000..76d23a3890760d66c2e491d9783ad719f5e9a850 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/357-4b2b2d6d-4f6f-4d9e-b1df-e0072ef294c4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/358-4c831d44-ce65-4fd0-bab8-1605646fc1de.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/358-4c831d44-ce65-4fd0-bab8-1605646fc1de.txn new file mode 100644 index 0000000000000000000000000000000000000000..c8d9f1dbb26ff404b225209bbeb6ee252ae4db3e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/358-4c831d44-ce65-4fd0-bab8-1605646fc1de.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/359-68fbd75b-45c4-4e11-8f05-8b960b00beb0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/359-68fbd75b-45c4-4e11-8f05-8b960b00beb0.txn new file mode 100644 index 0000000000000000000000000000000000000000..72ed5c61738c92fb7ad5af0fe666ea04015fde7a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/359-68fbd75b-45c4-4e11-8f05-8b960b00beb0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/36-ba363e2f-c036-422f-9ef6-05be59bb2f40.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/36-ba363e2f-c036-422f-9ef6-05be59bb2f40.txn new file mode 100644 index 0000000000000000000000000000000000000000..0b9f6a6122fc3ce7a526706ec1ca2c75e1c6018d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/36-ba363e2f-c036-422f-9ef6-05be59bb2f40.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/360-afb04f33-a59d-456e-94c8-d1435d3334ec.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/360-afb04f33-a59d-456e-94c8-d1435d3334ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..186392fc897e3fb338722181bc42cea0eab22ee8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/360-afb04f33-a59d-456e-94c8-d1435d3334ec.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/361-b0a7de4c-9476-47d8-8997-479fe8bff303.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/361-b0a7de4c-9476-47d8-8997-479fe8bff303.txn new file mode 100644 index 0000000000000000000000000000000000000000..ffdef4c0860029a0855a79b4c1fb8e7d15cac61d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/361-b0a7de4c-9476-47d8-8997-479fe8bff303.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/362-a91da242-e0f9-45f4-9b16-abf283c0b114.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/362-a91da242-e0f9-45f4-9b16-abf283c0b114.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd847a2adb5f2c6352f4e0c030f1855951294bfa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/362-a91da242-e0f9-45f4-9b16-abf283c0b114.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/363-c1577139-9854-4a50-9193-8b073d3ecb7e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/363-c1577139-9854-4a50-9193-8b073d3ecb7e.txn new file mode 100644 index 0000000000000000000000000000000000000000..118527385b8a610093e61dbfe75c81d8e0f7b1c2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/363-c1577139-9854-4a50-9193-8b073d3ecb7e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/364-f1e496c4-5c02-4fe6-98cd-7ec12ec6f376.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/364-f1e496c4-5c02-4fe6-98cd-7ec12ec6f376.txn new file mode 100644 index 0000000000000000000000000000000000000000..a6f34869ab9ed8bee991f18ea464637d77741fdf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/364-f1e496c4-5c02-4fe6-98cd-7ec12ec6f376.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/365-b5091bbd-d0ef-407d-80b8-437e9a014091.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/365-b5091bbd-d0ef-407d-80b8-437e9a014091.txn new file mode 100644 index 0000000000000000000000000000000000000000..7b56ce70f295c33239e079822ec5b8888b1a031a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/365-b5091bbd-d0ef-407d-80b8-437e9a014091.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/366-02ee61bb-e770-4b86-8a91-302315ea2aac.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/366-02ee61bb-e770-4b86-8a91-302315ea2aac.txn new file mode 100644 index 0000000000000000000000000000000000000000..f5f43493796b8a11be311c7aebc629351b8ed285 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/366-02ee61bb-e770-4b86-8a91-302315ea2aac.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/367-50881298-d0e7-45d3-9b8d-aedf690d33f9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/367-50881298-d0e7-45d3-9b8d-aedf690d33f9.txn new file mode 100644 index 0000000000000000000000000000000000000000..4c9873b5d1cc951566e623efd1cc06b71c0ed3ea Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/367-50881298-d0e7-45d3-9b8d-aedf690d33f9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/368-34c714bc-4c60-48f5-b5ca-d5124918eea6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/368-34c714bc-4c60-48f5-b5ca-d5124918eea6.txn new file mode 100644 index 0000000000000000000000000000000000000000..002aba13cddb2c61c1e1e8e7145a68be9f6c4502 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/368-34c714bc-4c60-48f5-b5ca-d5124918eea6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/369-a54dbf16-8bec-4264-91d3-60a78b40273d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/369-a54dbf16-8bec-4264-91d3-60a78b40273d.txn new file mode 100644 index 0000000000000000000000000000000000000000..4dd6603597798764754bf80e5b10b58aa6b0ebaf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/369-a54dbf16-8bec-4264-91d3-60a78b40273d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/37-f21c8cbe-86f6-4e6e-a745-35bbec4fa5b9.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/37-f21c8cbe-86f6-4e6e-a745-35bbec4fa5b9.txn new file mode 100644 index 0000000000000000000000000000000000000000..3da63bae3481a8ba30455f22ca5ab74696813eef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/37-f21c8cbe-86f6-4e6e-a745-35bbec4fa5b9.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/370-d09a5e36-ef72-49c0-985b-ba98d883d5ec.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/370-d09a5e36-ef72-49c0-985b-ba98d883d5ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..6d63eea6f75655b5909bf559d2c1a81334a6562b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/370-d09a5e36-ef72-49c0-985b-ba98d883d5ec.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/371-c4fabeb7-179d-4a6b-9013-b6e8cbf91807.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/371-c4fabeb7-179d-4a6b-9013-b6e8cbf91807.txn new file mode 100644 index 0000000000000000000000000000000000000000..ad6962af00e554492ccb88536c1342e97ba0bd36 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/371-c4fabeb7-179d-4a6b-9013-b6e8cbf91807.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/372-4584352e-04f2-4bc3-b32f-6835032b0e7f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/372-4584352e-04f2-4bc3-b32f-6835032b0e7f.txn new file mode 100644 index 0000000000000000000000000000000000000000..dbfe6dea2b6b68f3a076862f20620f32cc530153 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/372-4584352e-04f2-4bc3-b32f-6835032b0e7f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/373-5ad7f097-72dc-4bd8-aa9d-3c13a06d6e6c.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/373-5ad7f097-72dc-4bd8-aa9d-3c13a06d6e6c.txn new file mode 100644 index 0000000000000000000000000000000000000000..5d260ce22fd0e4f7ab392b31021db617e464ae82 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/373-5ad7f097-72dc-4bd8-aa9d-3c13a06d6e6c.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/374-4b66de76-f968-402b-9ee5-892308ae28cc.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/374-4b66de76-f968-402b-9ee5-892308ae28cc.txn new file mode 100644 index 0000000000000000000000000000000000000000..1d8c1424f652458c253cdb81fae83d8826bed7a9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/374-4b66de76-f968-402b-9ee5-892308ae28cc.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/375-6ef3a6ec-cba7-40d7-b6ed-1458a44f9df2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/375-6ef3a6ec-cba7-40d7-b6ed-1458a44f9df2.txn new file mode 100644 index 0000000000000000000000000000000000000000..76f3ae9461a83cfd572579fa526c18b2156ede29 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/375-6ef3a6ec-cba7-40d7-b6ed-1458a44f9df2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/376-63fdd2dd-aa7a-49f4-ba76-fd06f93ce7e3.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/376-63fdd2dd-aa7a-49f4-ba76-fd06f93ce7e3.txn new file mode 100644 index 0000000000000000000000000000000000000000..ee23931927d377529146f58be94181ce287e2de4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/376-63fdd2dd-aa7a-49f4-ba76-fd06f93ce7e3.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/377-08e0fbf2-2dbf-46be-a8bf-2d90af70c092.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/377-08e0fbf2-2dbf-46be-a8bf-2d90af70c092.txn new file mode 100644 index 0000000000000000000000000000000000000000..5460f78b93c053a810d20aec11993999beff65d0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/377-08e0fbf2-2dbf-46be-a8bf-2d90af70c092.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/378-9bde3dbc-fdeb-40c3-a042-e24d11369fc6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/378-9bde3dbc-fdeb-40c3-a042-e24d11369fc6.txn new file mode 100644 index 0000000000000000000000000000000000000000..1bc2e0eba0edea05984084b83c51dab3633d5e65 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/378-9bde3dbc-fdeb-40c3-a042-e24d11369fc6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/379-c37f00cc-30fc-40f7-897f-30e0e46bfe54.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/379-c37f00cc-30fc-40f7-897f-30e0e46bfe54.txn new file mode 100644 index 0000000000000000000000000000000000000000..46939e7ac814d60493a167cc38472c339146c673 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/379-c37f00cc-30fc-40f7-897f-30e0e46bfe54.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/38-ecf57582-1cb0-4bf4-9642-5ae4c92ac44a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/38-ecf57582-1cb0-4bf4-9642-5ae4c92ac44a.txn new file mode 100644 index 0000000000000000000000000000000000000000..a5b19a10e12882be42201364cc7c1c87fddc1037 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/38-ecf57582-1cb0-4bf4-9642-5ae4c92ac44a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/380-7d41946f-fc2e-46af-8387-0a990040e7e1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/380-7d41946f-fc2e-46af-8387-0a990040e7e1.txn new file mode 100644 index 0000000000000000000000000000000000000000..652692fc1dc18b23e1e2d9a56a07f1345c67395f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/380-7d41946f-fc2e-46af-8387-0a990040e7e1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/381-e73f7664-57b5-480e-ae00-ab3852ed5340.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/381-e73f7664-57b5-480e-ae00-ab3852ed5340.txn new file mode 100644 index 0000000000000000000000000000000000000000..1acf3915db53fb7a5d419d822789de472e22db10 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/381-e73f7664-57b5-480e-ae00-ab3852ed5340.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/382-68265fdc-c4df-440f-ba20-aafc9fbe046f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/382-68265fdc-c4df-440f-ba20-aafc9fbe046f.txn new file mode 100644 index 0000000000000000000000000000000000000000..942e301557008987559c0cdd020ceb67a3263307 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/382-68265fdc-c4df-440f-ba20-aafc9fbe046f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/383-6e088f6f-7f1a-4450-b24d-088bc4573f6f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/383-6e088f6f-7f1a-4450-b24d-088bc4573f6f.txn new file mode 100644 index 0000000000000000000000000000000000000000..32cc8375a9322084d059c949029c07edd4f11630 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/383-6e088f6f-7f1a-4450-b24d-088bc4573f6f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/384-19d75932-ede2-42b0-b1ae-f1a75d2291a4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/384-19d75932-ede2-42b0-b1ae-f1a75d2291a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..110e812a133113e5c63a8f52efe1b0624956b7f2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/384-19d75932-ede2-42b0-b1ae-f1a75d2291a4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/385-7a0821e0-969e-4032-8019-31d9e781038d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/385-7a0821e0-969e-4032-8019-31d9e781038d.txn new file mode 100644 index 0000000000000000000000000000000000000000..d76cb2aab87a88413884a3581b28f10286a53e4b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/385-7a0821e0-969e-4032-8019-31d9e781038d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/386-0f62b37e-eee1-48dd-8dc2-378ba09390a7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/386-0f62b37e-eee1-48dd-8dc2-378ba09390a7.txn new file mode 100644 index 0000000000000000000000000000000000000000..278e02f7076f3b1870f232c56e6030ac9d5e8ed0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/386-0f62b37e-eee1-48dd-8dc2-378ba09390a7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/387-eaa1a12e-02ad-4c4f-afeb-a6843b1ed710.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/387-eaa1a12e-02ad-4c4f-afeb-a6843b1ed710.txn new file mode 100644 index 0000000000000000000000000000000000000000..42146853ee302b92e5db22fe375efb9b710f4f8f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/387-eaa1a12e-02ad-4c4f-afeb-a6843b1ed710.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/388-871f1865-8180-4753-bbdc-6f48ee8353ca.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/388-871f1865-8180-4753-bbdc-6f48ee8353ca.txn new file mode 100644 index 0000000000000000000000000000000000000000..1092d1ce405937d3d997adf447ae699f18d0f1d5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/388-871f1865-8180-4753-bbdc-6f48ee8353ca.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/389-9d5d16b0-0a56-48ff-b8b2-03388fe51824.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/389-9d5d16b0-0a56-48ff-b8b2-03388fe51824.txn new file mode 100644 index 0000000000000000000000000000000000000000..744b2260c2fa92b4a87250132669343c261786a4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/389-9d5d16b0-0a56-48ff-b8b2-03388fe51824.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/39-12bb0761-a16f-4a76-9adf-f7f1732f68d0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/39-12bb0761-a16f-4a76-9adf-f7f1732f68d0.txn new file mode 100644 index 0000000000000000000000000000000000000000..f105687c41223a0de0ceac6a6dcb50bb34e4594b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/39-12bb0761-a16f-4a76-9adf-f7f1732f68d0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/390-74dafcf8-624c-437f-965b-681ff8144223.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/390-74dafcf8-624c-437f-965b-681ff8144223.txn new file mode 100644 index 0000000000000000000000000000000000000000..ec10322902d525c284f0dcebf4cf0d07099bf994 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/390-74dafcf8-624c-437f-965b-681ff8144223.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/391-8bf8525a-da2c-4e2e-b995-16699a7386e0.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/391-8bf8525a-da2c-4e2e-b995-16699a7386e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..910c447838c3c5f9d5d8cf630acd84e1aca6a2b4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/391-8bf8525a-da2c-4e2e-b995-16699a7386e0.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/392-9a9d0260-613f-420b-9ebc-df9728cd1483.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/392-9a9d0260-613f-420b-9ebc-df9728cd1483.txn new file mode 100644 index 0000000000000000000000000000000000000000..2bdf74ed1a009ec2fd783ba5d1987d42b2583bd8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/392-9a9d0260-613f-420b-9ebc-df9728cd1483.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/393-6bf55b65-effa-4600-95c4-8ec7cb4ec576.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/393-6bf55b65-effa-4600-95c4-8ec7cb4ec576.txn new file mode 100644 index 0000000000000000000000000000000000000000..fff561d38bc842a9f8d3aad3c3aec0840381edfb Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/393-6bf55b65-effa-4600-95c4-8ec7cb4ec576.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/394-36f42e1d-d39d-4018-a155-9c6d520223c6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/394-36f42e1d-d39d-4018-a155-9c6d520223c6.txn new file mode 100644 index 0000000000000000000000000000000000000000..a0d776cc56b4832f93121afccc205aef99e32742 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/394-36f42e1d-d39d-4018-a155-9c6d520223c6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/4-e4ee9968-7a6c-46f3-8a00-318925d1f9a4.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/4-e4ee9968-7a6c-46f3-8a00-318925d1f9a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..f183f97bb05931dc2d7eaabba8177201a887c107 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/4-e4ee9968-7a6c-46f3-8a00-318925d1f9a4.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/40-a729e27b-429a-4c7b-a2ed-e913ce19325b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/40-a729e27b-429a-4c7b-a2ed-e913ce19325b.txn new file mode 100644 index 0000000000000000000000000000000000000000..5059f8d571fc94defe446f0faec565bd581d3418 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/40-a729e27b-429a-4c7b-a2ed-e913ce19325b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/41-d96e15e8-ca23-47c1-8c6a-79040ffa7a1e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/41-d96e15e8-ca23-47c1-8c6a-79040ffa7a1e.txn new file mode 100644 index 0000000000000000000000000000000000000000..54bfd28ad070e8ce3162289ec77242ee6f29b4d2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/41-d96e15e8-ca23-47c1-8c6a-79040ffa7a1e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/42-94619806-f9b0-4165-b3cf-7acf0bab39ac.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/42-94619806-f9b0-4165-b3cf-7acf0bab39ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..3f32636fdf5f8c71063e2647fe52bec82a2b563c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/42-94619806-f9b0-4165-b3cf-7acf0bab39ac.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/43-5c908a18-d646-40aa-a1ef-5cc9a8239ca2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/43-5c908a18-d646-40aa-a1ef-5cc9a8239ca2.txn new file mode 100644 index 0000000000000000000000000000000000000000..26aeedc130c124becb5356edf4d6b4daf2a53c19 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/43-5c908a18-d646-40aa-a1ef-5cc9a8239ca2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/44-509366c4-d9f6-4022-806d-5190a73b6573.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/44-509366c4-d9f6-4022-806d-5190a73b6573.txn new file mode 100644 index 0000000000000000000000000000000000000000..3958fca0ab1d688e0a5dcb66de736e14fb51696b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/44-509366c4-d9f6-4022-806d-5190a73b6573.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/45-c7f421eb-8739-4c9c-82a1-8651eb8116aa.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/45-c7f421eb-8739-4c9c-82a1-8651eb8116aa.txn new file mode 100644 index 0000000000000000000000000000000000000000..2a12d5d5da464da677a3524313c0f3663dd6b14e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/45-c7f421eb-8739-4c9c-82a1-8651eb8116aa.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/46-eefef128-27ef-4560-a2bf-1a51f5f0921a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/46-eefef128-27ef-4560-a2bf-1a51f5f0921a.txn new file mode 100644 index 0000000000000000000000000000000000000000..a209eb46eab22586002fdb47ae5f36bb6febea7b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/46-eefef128-27ef-4560-a2bf-1a51f5f0921a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/47-99d93a96-e0d7-4b85-a1f6-7e035a415af7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/47-99d93a96-e0d7-4b85-a1f6-7e035a415af7.txn new file mode 100644 index 0000000000000000000000000000000000000000..52606c5a795fcd9616c446e10c018ad0f15b7f0b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/47-99d93a96-e0d7-4b85-a1f6-7e035a415af7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/48-a70aebff-01e4-4b11-a859-1fcc2878eac8.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/48-a70aebff-01e4-4b11-a859-1fcc2878eac8.txn new file mode 100644 index 0000000000000000000000000000000000000000..c94bca24707fb0ef2b669b89000245f2b34cde17 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/48-a70aebff-01e4-4b11-a859-1fcc2878eac8.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/49-4d3499ea-7154-4548-b64f-f4bc3b808f4d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/49-4d3499ea-7154-4548-b64f-f4bc3b808f4d.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce714ec09ed671356711fbf78343e9d51fdb852d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/49-4d3499ea-7154-4548-b64f-f4bc3b808f4d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/5-5a983e70-3d49-4ee7-9a44-25bc4dff55a3.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/5-5a983e70-3d49-4ee7-9a44-25bc4dff55a3.txn new file mode 100644 index 0000000000000000000000000000000000000000..18257ba7de8604f70e2ba59f6aa00d1bdc0088f8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/5-5a983e70-3d49-4ee7-9a44-25bc4dff55a3.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/50-96b41a48-fc20-43d7-9e2c-2d941ed9d6cd.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/50-96b41a48-fc20-43d7-9e2c-2d941ed9d6cd.txn new file mode 100644 index 0000000000000000000000000000000000000000..a31beb3ce33b947fdfa75c97445744c25d70e2da Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/50-96b41a48-fc20-43d7-9e2c-2d941ed9d6cd.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/51-5be21c1b-6b48-4eca-8f8e-ecb54565e4c6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/51-5be21c1b-6b48-4eca-8f8e-ecb54565e4c6.txn new file mode 100644 index 0000000000000000000000000000000000000000..df38c92e0e7aa6f330bb0e810b063a175e573b25 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/51-5be21c1b-6b48-4eca-8f8e-ecb54565e4c6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/52-5d446b78-32c5-4cdb-91e5-5ae8043d920b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/52-5d446b78-32c5-4cdb-91e5-5ae8043d920b.txn new file mode 100644 index 0000000000000000000000000000000000000000..6f13dd0f7cc918edb957dcdb554968e397e5522f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/52-5d446b78-32c5-4cdb-91e5-5ae8043d920b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/53-310f0828-268b-4811-b919-cb026e20ca73.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/53-310f0828-268b-4811-b919-cb026e20ca73.txn new file mode 100644 index 0000000000000000000000000000000000000000..fd4d97c2781f486998bb8a7197f497d3531dad74 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/53-310f0828-268b-4811-b919-cb026e20ca73.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/54-4639ec45-ca8b-4856-b907-7503e1aab6fb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/54-4639ec45-ca8b-4856-b907-7503e1aab6fb.txn new file mode 100644 index 0000000000000000000000000000000000000000..c591a1ffbc7bc595a88976527e9992050b4ef151 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/54-4639ec45-ca8b-4856-b907-7503e1aab6fb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/55-ea6cf0fa-8387-4c4b-a733-4d2764f82ae5.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/55-ea6cf0fa-8387-4c4b-a733-4d2764f82ae5.txn new file mode 100644 index 0000000000000000000000000000000000000000..8dfe32d4ba34f758cf28c0782caf5d830cf95617 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/55-ea6cf0fa-8387-4c4b-a733-4d2764f82ae5.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/56-74df09be-b2da-4908-9ec5-03da9ffb6804.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/56-74df09be-b2da-4908-9ec5-03da9ffb6804.txn new file mode 100644 index 0000000000000000000000000000000000000000..c5adeb5b0e6fd4eb1ee0c59ebd920d4cb2761556 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/56-74df09be-b2da-4908-9ec5-03da9ffb6804.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/57-c5efa305-1349-415f-a56a-16f88ec8692a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/57-c5efa305-1349-415f-a56a-16f88ec8692a.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e6abc3249b868c86d1b8b9145b43f1c7e00d732 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/57-c5efa305-1349-415f-a56a-16f88ec8692a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/58-61efcff6-ec14-4ec2-9c96-8c06405e7f70.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/58-61efcff6-ec14-4ec2-9c96-8c06405e7f70.txn new file mode 100644 index 0000000000000000000000000000000000000000..cf54c4a670bc292a6a78367790d991965120f7b5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/58-61efcff6-ec14-4ec2-9c96-8c06405e7f70.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/59-9d20fe9d-cb60-44b2-9f9c-9d464b0eb189.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/59-9d20fe9d-cb60-44b2-9f9c-9d464b0eb189.txn new file mode 100644 index 0000000000000000000000000000000000000000..935657118f3b80e4ae4eb91d019c0cb5757ed65e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/59-9d20fe9d-cb60-44b2-9f9c-9d464b0eb189.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/6-39efb2ee-aa9e-4bd3-91f9-6a8e49075761.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/6-39efb2ee-aa9e-4bd3-91f9-6a8e49075761.txn new file mode 100644 index 0000000000000000000000000000000000000000..6ae98906fbdca3365c68d77005d173ee53d5bb01 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/6-39efb2ee-aa9e-4bd3-91f9-6a8e49075761.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/60-e90d58f1-af54-431c-8e7c-df1d02871a27.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/60-e90d58f1-af54-431c-8e7c-df1d02871a27.txn new file mode 100644 index 0000000000000000000000000000000000000000..49ff9d34747abc6cbd3e4836840d75338e431561 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/60-e90d58f1-af54-431c-8e7c-df1d02871a27.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/61-1aa23423-9413-44a3-a60a-9ba092a28d2b.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/61-1aa23423-9413-44a3-a60a-9ba092a28d2b.txn new file mode 100644 index 0000000000000000000000000000000000000000..9ee846634baefa9b684996755fef3ed890f1c47f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/61-1aa23423-9413-44a3-a60a-9ba092a28d2b.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/62-7aeaaf28-cafe-45a6-93be-b96448f7fce7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/62-7aeaaf28-cafe-45a6-93be-b96448f7fce7.txn new file mode 100644 index 0000000000000000000000000000000000000000..df89514e1af8cd002ebf0614c91cb2a77762872a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/62-7aeaaf28-cafe-45a6-93be-b96448f7fce7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/63-22b90732-eb5e-4bab-9648-add06b1e3243.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/63-22b90732-eb5e-4bab-9648-add06b1e3243.txn new file mode 100644 index 0000000000000000000000000000000000000000..bf02ba6fe58d098dcc3dee5090e868e08caf49e5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/63-22b90732-eb5e-4bab-9648-add06b1e3243.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/64-eb6931bf-6ce1-4b72-bb36-14261a40051f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/64-eb6931bf-6ce1-4b72-bb36-14261a40051f.txn new file mode 100644 index 0000000000000000000000000000000000000000..1fe8d1dc563a99d51c996a09af2caefd14536aae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/64-eb6931bf-6ce1-4b72-bb36-14261a40051f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/65-c5e07d60-1f70-47b4-b82e-fa49bd5f483f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/65-c5e07d60-1f70-47b4-b82e-fa49bd5f483f.txn new file mode 100644 index 0000000000000000000000000000000000000000..fcfead4003a438269633d57d92ec24e8b387d00a Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/65-c5e07d60-1f70-47b4-b82e-fa49bd5f483f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/66-d1d5e85a-2d06-4e06-9bca-5f4dac42f110.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/66-d1d5e85a-2d06-4e06-9bca-5f4dac42f110.txn new file mode 100644 index 0000000000000000000000000000000000000000..5382247f5906b9059ce5ca0c7d48edf5075e6b0f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/66-d1d5e85a-2d06-4e06-9bca-5f4dac42f110.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/67-3de61837-7f94-49be-b596-ac75fe31cb01.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/67-3de61837-7f94-49be-b596-ac75fe31cb01.txn new file mode 100644 index 0000000000000000000000000000000000000000..a7397e1c3010748f1add5fdd2e5ef92ec0f83d7d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/67-3de61837-7f94-49be-b596-ac75fe31cb01.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/68-d211f17e-a2c4-46b7-8be1-7f5cc37a1da1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/68-d211f17e-a2c4-46b7-8be1-7f5cc37a1da1.txn new file mode 100644 index 0000000000000000000000000000000000000000..74f96b3dbff117b23d4d49c21248b0a64adf8a11 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/68-d211f17e-a2c4-46b7-8be1-7f5cc37a1da1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/69-fc5ac8d5-7af2-4970-a514-8be32580d162.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/69-fc5ac8d5-7af2-4970-a514-8be32580d162.txn new file mode 100644 index 0000000000000000000000000000000000000000..62cbf62e2e378b327e4869ca13df3cdbc6f36f29 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/69-fc5ac8d5-7af2-4970-a514-8be32580d162.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/7-71447671-87c7-4697-809c-38dfd8708b12.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/7-71447671-87c7-4697-809c-38dfd8708b12.txn new file mode 100644 index 0000000000000000000000000000000000000000..63dd69ec43ba18b357eea343393914722fcfafa5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/7-71447671-87c7-4697-809c-38dfd8708b12.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/70-5a5c9ce5-df49-48bc-8204-70329bd74faa.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/70-5a5c9ce5-df49-48bc-8204-70329bd74faa.txn new file mode 100644 index 0000000000000000000000000000000000000000..350503829ce75bd87abc0aa0605aaf08d0fa4432 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/70-5a5c9ce5-df49-48bc-8204-70329bd74faa.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/71-8a8b58e2-4391-448a-a11f-c8aa55decaad.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/71-8a8b58e2-4391-448a-a11f-c8aa55decaad.txn new file mode 100644 index 0000000000000000000000000000000000000000..019f21193397057e93fe0e98bb95d412dc74c799 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/71-8a8b58e2-4391-448a-a11f-c8aa55decaad.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/72-d8df3012-5322-4cbb-a80c-2c627acabd84.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/72-d8df3012-5322-4cbb-a80c-2c627acabd84.txn new file mode 100644 index 0000000000000000000000000000000000000000..59a2fd08aeba5245945e4462a25a5b294207cdd5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/72-d8df3012-5322-4cbb-a80c-2c627acabd84.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/73-26b8a5ce-7c7b-41a0-b3a1-190b1d4ad057.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/73-26b8a5ce-7c7b-41a0-b3a1-190b1d4ad057.txn new file mode 100644 index 0000000000000000000000000000000000000000..8e5ff40dfbad304cfaad7bc9d7cd05403b53de8d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/73-26b8a5ce-7c7b-41a0-b3a1-190b1d4ad057.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/74-84fc0eca-3e12-4762-846b-f7d1c0cd166d.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/74-84fc0eca-3e12-4762-846b-f7d1c0cd166d.txn new file mode 100644 index 0000000000000000000000000000000000000000..1eaf04f3face014405c5834116a5329e54e8fb28 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/74-84fc0eca-3e12-4762-846b-f7d1c0cd166d.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/75-aa8a0941-0888-420a-9473-8493535599ec.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/75-aa8a0941-0888-420a-9473-8493535599ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..497fdfaa345f5babe588a62ec5ff5251a5e6290b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/75-aa8a0941-0888-420a-9473-8493535599ec.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/76-7310067b-d73c-400d-a667-7fdae82b83d7.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/76-7310067b-d73c-400d-a667-7fdae82b83d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..152a4b7b624b8eb196f3fb95db9a9ae1899ac29c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/76-7310067b-d73c-400d-a667-7fdae82b83d7.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/77-ce2982c3-20e2-4cf4-b698-77acf79b9f87.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/77-ce2982c3-20e2-4cf4-b698-77acf79b9f87.txn new file mode 100644 index 0000000000000000000000000000000000000000..5a2e5c8f8bf94356ee72b5cb3dc3d18b4ac0b045 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/77-ce2982c3-20e2-4cf4-b698-77acf79b9f87.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/78-b2e701d8-7a18-43ee-9a73-027567be8c8f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/78-b2e701d8-7a18-43ee-9a73-027567be8c8f.txn new file mode 100644 index 0000000000000000000000000000000000000000..1f0edd33b81555d3945698c6aed9189daa028ecf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/78-b2e701d8-7a18-43ee-9a73-027567be8c8f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/79-6f3ccadd-b622-4d25-a494-57760f3c4abb.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/79-6f3ccadd-b622-4d25-a494-57760f3c4abb.txn new file mode 100644 index 0000000000000000000000000000000000000000..76c437ad99fbd901bafeba516efa149f32843fbc Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/79-6f3ccadd-b622-4d25-a494-57760f3c4abb.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/8-8f5fc73c-8921-4fb5-ad58-c0cc9c0c761a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/8-8f5fc73c-8921-4fb5-ad58-c0cc9c0c761a.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7043e59f51ef4a46dfdcd20e6c76a8aaabbd473 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/8-8f5fc73c-8921-4fb5-ad58-c0cc9c0c761a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/80-66759ab0-3300-424b-bfc8-5986bad937b6.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/80-66759ab0-3300-424b-bfc8-5986bad937b6.txn new file mode 100644 index 0000000000000000000000000000000000000000..906151d2edeb9e15db9e1fef6bc630caf54cd210 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/80-66759ab0-3300-424b-bfc8-5986bad937b6.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/81-2ba37e38-7959-4453-ab52-bf65cbec9e5f.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/81-2ba37e38-7959-4453-ab52-bf65cbec9e5f.txn new file mode 100644 index 0000000000000000000000000000000000000000..be0b62f5c6eb5dc3132a1d4dece2639fc2c69d21 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/81-2ba37e38-7959-4453-ab52-bf65cbec9e5f.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/82-5ca6f6a6-8700-4c0c-96d2-de5d5cef2a0a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/82-5ca6f6a6-8700-4c0c-96d2-de5d5cef2a0a.txn new file mode 100644 index 0000000000000000000000000000000000000000..03fa1a991e5469c9cbb311bdcb4b56ab1305eec8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/82-5ca6f6a6-8700-4c0c-96d2-de5d5cef2a0a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/83-a72cb098-6773-4083-92ff-9102b69ae9d2.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/83-a72cb098-6773-4083-92ff-9102b69ae9d2.txn new file mode 100644 index 0000000000000000000000000000000000000000..b8a4c53914604ff0537313fd06cd837d6e8aff12 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/83-a72cb098-6773-4083-92ff-9102b69ae9d2.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/84-abd382f1-b841-4e78-bd1d-3a76fa539731.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/84-abd382f1-b841-4e78-bd1d-3a76fa539731.txn new file mode 100644 index 0000000000000000000000000000000000000000..b0dd3ae2899e68f14882e6bb310876338a791089 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/84-abd382f1-b841-4e78-bd1d-3a76fa539731.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/85-da98612a-1033-4aac-8ffe-67f1dccd9711.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/85-da98612a-1033-4aac-8ffe-67f1dccd9711.txn new file mode 100644 index 0000000000000000000000000000000000000000..c45e9b1ea2102ae483b4c96ed162f1040bb60424 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/85-da98612a-1033-4aac-8ffe-67f1dccd9711.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/86-055b33cb-7531-42b7-8a2a-ece33f770b90.txn 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/86-055b33cb-7531-42b7-8a2a-ece33f770b90.txn new file mode 100644 index 0000000000000000000000000000000000000000..a2db4b284d993539cc0b368d27c5a84bbbccead9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/86-055b33cb-7531-42b7-8a2a-ece33f770b90.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/87-f243714f-bf20-40bd-bd33-ecc24aff0845.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/87-f243714f-bf20-40bd-bd33-ecc24aff0845.txn new file mode 100644 index 0000000000000000000000000000000000000000..763b2dde8bf7fad79b1640781f351573d76b0163 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/87-f243714f-bf20-40bd-bd33-ecc24aff0845.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/88-d5f85fed-4105-45e7-b2c0-adc3df5ae399.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/88-d5f85fed-4105-45e7-b2c0-adc3df5ae399.txn new file mode 100644 index 0000000000000000000000000000000000000000..36d7b3c6ed6fde3f35c6a4776d251d4faa60b7e7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/88-d5f85fed-4105-45e7-b2c0-adc3df5ae399.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/89-5c53f528-24da-43bd-bb09-0b53d58ee919.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/89-5c53f528-24da-43bd-bb09-0b53d58ee919.txn new file mode 100644 index 0000000000000000000000000000000000000000..4e9f892b5b13a0409979fee06c255d384c1b752a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/89-5c53f528-24da-43bd-bb09-0b53d58ee919.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/9-bdbb2186-3363-48c1-8941-9766582ae99e.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/9-bdbb2186-3363-48c1-8941-9766582ae99e.txn new file mode 100644 index 0000000000000000000000000000000000000000..1b63019ab4527ea78fd48f711f18f2e2ac40aea3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/9-bdbb2186-3363-48c1-8941-9766582ae99e.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/90-d7dba54d-d8d4-4b5a-a385-24d1e4599e5a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/90-d7dba54d-d8d4-4b5a-a385-24d1e4599e5a.txn new file mode 100644 index 0000000000000000000000000000000000000000..5b522eaf999d3968d22b0c4722c81f7c214c027c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/90-d7dba54d-d8d4-4b5a-a385-24d1e4599e5a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/91-54e024b2-8526-4350-a1f7-f6d83f94eb35.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/91-54e024b2-8526-4350-a1f7-f6d83f94eb35.txn new file mode 100644 index 0000000000000000000000000000000000000000..b33896f8c4f03455dadc8c6a5e3145c39c648238 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/91-54e024b2-8526-4350-a1f7-f6d83f94eb35.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/92-1fb8e99a-338e-40a7-9bf8-6073e9659c05.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/92-1fb8e99a-338e-40a7-9bf8-6073e9659c05.txn new file mode 100644 index 0000000000000000000000000000000000000000..2a54d7712fcbcb16e8d44c6506b197467f21cada Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/92-1fb8e99a-338e-40a7-9bf8-6073e9659c05.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/93-a868593d-91ee-4deb-a3d4-45189125b676.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/93-a868593d-91ee-4deb-a3d4-45189125b676.txn new file mode 100644 index 0000000000000000000000000000000000000000..ecce720605da427d204547f0052b8d8a7eefebc6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/93-a868593d-91ee-4deb-a3d4-45189125b676.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/94-31ae82ae-b15d-4c31-81f9-8e6c44f47085.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/94-31ae82ae-b15d-4c31-81f9-8e6c44f47085.txn new file mode 100644 index 0000000000000000000000000000000000000000..2564d46fc07480d0d9180fb6740d0158d367e763 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/94-31ae82ae-b15d-4c31-81f9-8e6c44f47085.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/95-63fa891d-44df-4d11-8f66-753c3a76c9b1.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/95-63fa891d-44df-4d11-8f66-753c3a76c9b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..f89e62616b6ff4438525fd1583b4972df4062698 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/95-63fa891d-44df-4d11-8f66-753c3a76c9b1.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/96-d82566a7-c41d-4e77-9f61-fdd4dd3beaed.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/96-d82566a7-c41d-4e77-9f61-fdd4dd3beaed.txn new file mode 100644 index 0000000000000000000000000000000000000000..84fd8f4770f368fe01185ba163eb0f4115ce700e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/96-d82566a7-c41d-4e77-9f61-fdd4dd3beaed.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/97-57489f76-cb4a-40ae-ba61-0a0f2fb4ca2a.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/97-57489f76-cb4a-40ae-ba61-0a0f2fb4ca2a.txn new file mode 100644 index 0000000000000000000000000000000000000000..a3c881af61fc730f98076c85ef966c30eba126b5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/97-57489f76-cb4a-40ae-ba61-0a0f2fb4ca2a.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/98-18218dec-94d9-49d4-9e33-7bed47bbb360.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/98-18218dec-94d9-49d4-9e33-7bed47bbb360.txn new file mode 100644 index 0000000000000000000000000000000000000000..3bcebf13ff3ffecf69d76fc57e7071105d2e34df Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/98-18218dec-94d9-49d4-9e33-7bed47bbb360.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_transactions/99-9cf63421-c84d-4f14-8508-284a897798dc.txn b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/99-9cf63421-c84d-4f14-8508-284a897798dc.txn new file mode 100644 index 0000000000000000000000000000000000000000..90e6a3190a8f089c068ca69670780714cf6708aa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_transactions/99-9cf63421-c84d-4f14-8508-284a897798dc.txn differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/1.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/1.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..f8a8c643318405c99f166f8faa0f5f03fc598c57 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/1.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/10.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/10.manifest new file mode 100644 index 0000000000000000000000000000000000000000..941f129b8b95fb449690a12060196d1feef63c90 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/10.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/100.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/100.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c76cce693d8521f2736114dbcdfbec0941accb2f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/100.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/101.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/101.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ce8fa91f026fc9fb49005b39f66acf6da67194e7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/101.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/102.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/102.manifest new file mode 100644 index 0000000000000000000000000000000000000000..df5fbcb606b8274ac60fa316c1e8e005c041c423 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/102.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/103.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/103.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d8b1a4c5835058918321d0323653cd794d0cdcff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/103.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/104.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/104.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb590497581abe762002a527418deffd9adf0acf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/104.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/105.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/105.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c855355e11dfed6f167129f84b3b7d9363e0501a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/105.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/106.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/106.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7c51a966c678cf9f91d6467640d20f42b28dcbbd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/106.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/107.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/107.manifest new file mode 100644 index 0000000000000000000000000000000000000000..133ac06c576b6e4c866ca12fd4f22eb05ae4b76e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/107.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/108.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/108.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2becd605d514d3b43ee3938b6f498a2f9cab3065 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/108.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/109.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/109.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b93d870f2a48564142f3b866d9b41e4aa15855d4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/109.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/11.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/11.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e27df9a36d2131f63b1732d17e73b42bba584cc9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/11.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/110.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/110.manifest new file mode 100644 index 0000000000000000000000000000000000000000..062a61ab8350a8bf639c20738f2fa29c99ef79c5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/110.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/111.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/111.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cbc38329a8c0c99576ec5d4f24de62b63d7cc633 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/111.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/112.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/112.manifest new file mode 100644 index 0000000000000000000000000000000000000000..177513020340dfb374a359bdfbeb2e543df44208 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/112.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/113.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/113.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1b66922b06607f30d9773a6892eb3e182d124640 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/113.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/114.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/114.manifest new file mode 100644 index 0000000000000000000000000000000000000000..95f4d91d41b5b0bb2d5396c592fccc3695674016 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/114.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/115.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/115.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd54554c1615e756a1412d9eb06ccb8506724368 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/115.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/116.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/116.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c395406e7a9ca8531206e305d08dc2355e0ae8b1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/116.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/117.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/117.manifest new file mode 100644 index 0000000000000000000000000000000000000000..005f6fa0eaad06dd80756aa13e85c3f1de9ad9e4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/117.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/118.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/118.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c123439286239c5a8cc63a29cb94464aaad81412 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/118.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/119.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/119.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bd3560203679ebcf7d95babcf9733f73a5eb573c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/119.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/12.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/12.manifest new file mode 100644 index 0000000000000000000000000000000000000000..076c02c4059e963d2cd7cdd7dd60743fc5918f29 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/12.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/120.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/120.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2ece944e0aa6c864d0d660fd463aee1f9c631b36 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/120.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/121.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/121.manifest new file mode 100644 index 0000000000000000000000000000000000000000..07b0a168ccfe7c528bf0cdaf1486509150a73d40 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/121.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/122.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/122.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e4eed2fad8a4a655fc65f36240b1bebd64c38f98 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/122.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/123.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/123.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8391236d2bf1de8f55ae35102836917089e6a4ff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/123.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/124.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/124.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9783f18ffe1c0fe300d405dc1776951e1c21ec48 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/124.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/125.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/125.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1f312c1c896f7c90a2c1763be94c3284007c7e12 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/125.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/126.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/126.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5604d4db76aaae5ae29abd5329b436f7923347e5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/126.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/127.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/127.manifest new file mode 100644 index 0000000000000000000000000000000000000000..06ff5f55a62a272763e5d63d17b728b94a4e9b93 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/127.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/128.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/128.manifest new file mode 100644 index 0000000000000000000000000000000000000000..963e6d4dba059a1bef0e5f84490d25162c209aff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/128.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/129.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/129.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2c7716e8cfaae771b7724a13a6d6bc56482e2153 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/129.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/13.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/13.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a2881dd7946200160f805acc914138dc6c860bc1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/13.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/130.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/130.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e76d69db3740c7c33dcd54e597d1ebef7d6d6db Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/130.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/131.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/131.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fc2603b2d8e2d91dea2efd6d6496fa822f91d30a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/131.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/132.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/132.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e38a716d1de8a5bad5607e93f7dba058bc98fdd9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/132.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/133.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/133.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bd664892339da7a4e20a1bc0cfa108b0dc9dca8e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/133.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/134.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/134.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..70eeacc018ece28d6da717d215539d88d4576ab5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/134.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/135.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/135.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8473bcf73c237629371002b9372437cb5c241b40 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/135.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/136.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/136.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0f6838355610bb5d0b5cab2cdf4998d097f2dae7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/136.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/137.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/137.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7489a116fe82e819d0f1e212f8fbdb4196679228 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/137.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/138.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/138.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2dc61f50b8d9f7f6cc17aac512bcf98843ac2598 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/138.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/139.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/139.manifest new file mode 100644 index 0000000000000000000000000000000000000000..70c424a2e99be760664e1dd0828d3a8ebde1ddaa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/139.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/14.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/14.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6e9a4643cd3d4ed88ae1fd36196c59002912fd09 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/14.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/140.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/140.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1456b386b17049cdf0c79a5c3adfcaaef93db6b3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/140.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/141.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/141.manifest new file mode 100644 index 0000000000000000000000000000000000000000..624250215082587c43b602571f3fc5adf8006179 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/141.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/142.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/142.manifest new file mode 100644 index 0000000000000000000000000000000000000000..73f8ff3560046e06633b8ee7ca37feca82db4305 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/142.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/143.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/143.manifest new file mode 100644 index 0000000000000000000000000000000000000000..764fb2b5c9bfa1804a17072a455205e718718e7c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/143.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/144.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/144.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c713f383b2d026746dd8a51b87ac34d730248445 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/144.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/145.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/145.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c92b34ec108b9ff0254f299c0b4e347385d951ed Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/145.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/146.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/146.manifest new file mode 100644 index 0000000000000000000000000000000000000000..91b3e852c88aabbffadd79f6d179b0fc21f26761 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/146.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/147.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/147.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e4e11d1ea9bf044447fceba87f1a8eb36ed0279f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/147.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/148.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/148.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0bf8e419796c51d80d4e22fadebf2e3b3b05bfc3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/148.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/149.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/149.manifest new file mode 100644 index 0000000000000000000000000000000000000000..09d48953bf0aa11c6bde1c9c15eeec8d142730ec Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/149.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/15.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/15.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02af86e209ac06d2f0b8da8537322d810339704e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/15.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/150.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/150.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ba80e4330070406f56994a5e4f659fbc1b6b1e30 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/150.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/151.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/151.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9f473224d039f04d087c1d2d853dcf980084a279 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/151.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/152.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/152.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d16b2f5c1949f74ee1db8a3e5a9360af0627e14b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/152.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/153.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/153.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ac4b8c16c9a926a474d6a9ce7ae7528d1274c974 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/153.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/154.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/154.manifest new file mode 100644 index 0000000000000000000000000000000000000000..218f6a240782d34a724ba6ffc6898475fdfa7d8a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/154.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/155.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/155.manifest new file mode 100644 index 0000000000000000000000000000000000000000..05a51b88c40f0a6d9fa7c276b3252928e7e46a08 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/155.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/156.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/156.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b6ca9f2f337cf9af810aa1047d3a412e2c881f57 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/156.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/157.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/157.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60548f12749d81e3d198d0c321062c2c1888cfef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/157.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/158.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/158.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3676eaf938218d0c5681678508ac8bd3aeaa2703 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/158.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/159.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/159.manifest new file mode 100644 index 0000000000000000000000000000000000000000..034398e5e3f71ae8deec4daa6ef69b03ec72a00c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/159.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/16.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/16.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a2b10e7cea515557bd826a4d46fd40b03d99b95 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/16.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/160.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/160.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cef0cc80f94911f175e62912911f7901c3fc0ff9 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/160.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/161.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/161.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b50d071d65a28f877ee49c4fec0cae965c30f7aa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/161.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/162.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/162.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b5969b73f613648077c52d34c9386c8ea8ea7b45 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/162.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/163.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/163.manifest new file mode 100644 index 0000000000000000000000000000000000000000..510c683f4e3b4d7afeabd887b37dccefd4482ebc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/163.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/164.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/164.manifest new file mode 100644 index 0000000000000000000000000000000000000000..120fc5f37ba7c98c2142d08a5dcb6952763a2dfb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/164.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/165.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/165.manifest new file mode 100644 index 0000000000000000000000000000000000000000..59ead4b0a369ed4627c18a90e828d0e1792b37ac Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/165.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/166.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/166.manifest new file mode 100644 index 0000000000000000000000000000000000000000..66ea399979ea21651dc5608eb875d9438ccde5bf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/166.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/167.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/167.manifest new file mode 100644 index 0000000000000000000000000000000000000000..29789dae3527702e335c48b2874c631941702ce3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/167.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/168.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/168.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9edd9ebf8f68e1535b37d9f09a6b09d55bb32627 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/168.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/169.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/169.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b90aa52f7a44e9f72bec8a58587e74074b3eca52 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/169.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/17.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/17.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..fec4050ad3b1451f0d813b9306d75863fa277fc1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/17.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/170.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/170.manifest new file mode 100644 index 0000000000000000000000000000000000000000..70cc44b3389b5be35f2ef5cffc8e6f1ab6ce45fe Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/170.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/171.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/171.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b38e3b266cd3e977a8fe5133f4f72b1cbc23b997 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/171.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/172.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/172.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6897a7c5d6ff414467d1a2aae499cba25ceb4f0b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/172.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/173.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/173.manifest new file mode 100644 index 0000000000000000000000000000000000000000..104c9ced104e9030723b45d414a1fc54d11c21a2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/173.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/174.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/174.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c5aaeeb2e2c7e08c8baeb9607618852f0b9eaa6f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/174.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/175.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/175.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6316bc7e7452c26a9f4b2f1a4890dadba11b907a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/175.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/176.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/176.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cdcfed33469c70662f0f950a870989198e51eaf9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/176.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/177.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/177.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a11b9280c36590383ac1401bdbe3ce91378e7732 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/177.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/178.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/178.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0625e839bb2a9461b817d935cab47c93bafe08af Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/178.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/179.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/179.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6f24ed33882e43be21ea9e801e9eba30ddba24ea Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/179.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/18.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/18.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e85d3ece912bfc3166899b4a417fd222f13ca6f1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/18.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/180.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/180.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8a7663b65858008630cc2f8e90aa2626104fd260 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/180.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/181.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/181.manifest new file mode 100644 index 0000000000000000000000000000000000000000..104adcddacf89cc3e5a4efcb9683c0e48cc605e3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/181.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/182.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/182.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ed9ceeac1164a150616e70f8464428eb149a50d8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/182.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/183.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/183.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da6dc030f7a6966984ac50965501ffe856aa045e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/183.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/184.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/184.manifest new file mode 100644 index 0000000000000000000000000000000000000000..10935844571c54d45208c379cbec37fd40df9467 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/184.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/185.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/185.manifest new file mode 100644 index 0000000000000000000000000000000000000000..72d249aeb4e3df8ac03e8f3572aa82106d84d6f4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/185.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/186.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/186.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1eed06e6d74d576063f178c385c5309513754df4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/186.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/187.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/187.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60e352f7e89f8ebb3525f1fb892a716149c88166 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/187.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/188.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/188.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c7fde6db37ce0065efcd8e1dd2a59c2c9fcaadde Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/188.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/189.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/189.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3d3ce84f70c1fe8339f4b609805f85d643fecf98 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/189.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/19.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/19.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7113f08dfa1c2a45920598ad91690da625cea1dc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/19.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/190.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/190.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b222e2c84db867ab0c6306ff545f41944d6dab5d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/190.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/191.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/191.manifest new file mode 100644 index 0000000000000000000000000000000000000000..225fce248ee9fe8f37fa369cbb2b28e0da94d7ed Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/191.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/192.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/192.manifest new file mode 100644 index 0000000000000000000000000000000000000000..805a09f8e059e2c84fd6eb727f6432c8740070d0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/192.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/193.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/193.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a4830f414459f709e4aa4ae7bbf4aa283fbcb43b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/193.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/194.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/194.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0e42c1b187d1ca4eacc4cb1f488d8a35f1324de6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/194.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/195.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/195.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6d4833db2a0349631e3a9dc29a31616e0bc3faa5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/195.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/196.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/196.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a0d99463cd9447de400f98355b6109141188bcee Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/196.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/197.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/197.manifest new file mode 100644 index 0000000000000000000000000000000000000000..79ef8ab5f785c632994cc7c3fe2743ca25618f30 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/197.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/198.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/198.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ecd22610f53167395830bb2b6ea6f74bef1ea45c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/198.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/199.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/199.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e3b17a38f2eb40b058c9e9dc52327d49ff1dcc60 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/199.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/2.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/2.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0036f55661414867c8a712657aac729595276d63 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/2.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/20.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/20.manifest new file mode 100644 index 0000000000000000000000000000000000000000..58915754f382f6d76c5969aa22819bff8337856e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/20.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/200.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/200.manifest new file mode 100644 index 0000000000000000000000000000000000000000..918b19a9c9818677229e4afe971a23c70aea9aae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/200.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/201.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/201.manifest new file mode 100644 index 0000000000000000000000000000000000000000..40abd1b1aa55f56a3f2278526c1d7899c79cf20b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/201.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/202.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/202.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c39bfd13dbbb2e862afc9e5e3a9c8073bab3a5e4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/202.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/203.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/203.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5e6cf73ed484fbb571f637fa4de967c6a460da1a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/203.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/204.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/204.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6e0ed1c221fd9b423aa787952e97abd4fc5b5a52 
Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/204.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/205.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/205.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c2b3f3e1120ad6802e691fa1965895332f1ec153 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/205.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/206.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/206.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b9e9819750855a10264a5c921f02f03e40107ee3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/206.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/207.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/207.manifest new file mode 100644 index 0000000000000000000000000000000000000000..81ea114efb2df1ffb99316d8cafc7c1a63685e5e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/207.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/208.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/208.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8bcd3438e893e89c60ab04847fef4fbcf0964293 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/208.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/209.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/209.manifest new file mode 100644 index 0000000000000000000000000000000000000000..daaf2b3ac3e8c7b9410ea25bc4362870400328a3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/209.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/21.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/21.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fa6416a6b4e06f49ff681a901553f39d083d3b18 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/21.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/210.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/210.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f2fa661bfd7ed36ee00e39078291a90ed597dab2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/210.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/211.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/211.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8c318b983c872575bf1b8b0ee8db4a3f20a0a66e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/211.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/212.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/212.manifest new file mode 100644 index 0000000000000000000000000000000000000000..35b13a40be86d27613b264fa0b7b238d5a8a84c1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/212.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/213.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/213.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..15a1871e8f02d12ffd380886fe4cbc6808065e5b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/213.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/214.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/214.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c7e7905b12ed75d7bf86f782f883e3123ad55c58 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/214.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/215.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/215.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1f3b7eaeff4387bcc10816a2d5e8c97a8e77f13b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/215.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/216.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/216.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b58568a7dc760cdeaf98c489a31d31ed5c9d11af Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/216.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/217.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/217.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ed2ddfad07a595dc0a125c8c9896641d1f2df9bd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/217.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/218.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/218.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4de89c0528c17a24c7111fed4afd93c95f141ed8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/218.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/219.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/219.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5a06426b4f0c726c103de8de60b200ed1029bd88 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/219.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/22.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/22.manifest new file mode 100644 index 0000000000000000000000000000000000000000..55751e5c70919e390de404a08c61047fec551ef1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/22.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/220.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/220.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4e7f8928362125cb0e56300ad9b30b8de53e5930 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/220.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/221.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/221.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6d03d3b9f75d6dc23ea8b353556c34d8f7839832 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/221.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/222.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/222.manifest new file mode 100644 index 0000000000000000000000000000000000000000..79ce8a99cc75c98fda417d44446262f067a5fb86 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/222.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/223.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/223.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c0d853f5e875f68704250050fbe7346b1b8891cb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/223.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/224.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/224.manifest new file mode 100644 index 0000000000000000000000000000000000000000..849013f05c9312a0f873e0d84e206309bcf2b04e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/224.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/225.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/225.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8548ba46f933947a37872a7bf540f72a0fe67dc8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/225.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/226.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/226.manifest new file mode 100644 index 0000000000000000000000000000000000000000..963e2ffcd7287ad393fc60e904a811998edcdb3b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/226.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/227.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/227.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d426111eaed95f4d51b0386eee31b17df5859066 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/227.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/228.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/228.manifest new file mode 100644 index 0000000000000000000000000000000000000000..af34ce7bdde9526ad77a1eeb38507b455a2e16ff Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/228.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/229.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/229.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ca1453e3d9c76c38368cf483ed32125715f5869b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/229.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/23.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/23.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bec6d1670190d494707be7220c3a61dfd789d05a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/23.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/230.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/230.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6499724996dd4a5a70f3a5b00e4358a0ef01b996 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/230.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/231.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/231.manifest new file mode 100644 index 0000000000000000000000000000000000000000..21402851875ccc67e0c740a65003fafc2ee14d81 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/231.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/232.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/232.manifest new file mode 100644 index 0000000000000000000000000000000000000000..572b00d665337191dc27d39033e6faa6af0e8312 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/232.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/233.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/233.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e84a6262cd6a5a3e5c0753de17b893997b5e69d9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/233.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/234.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/234.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d91d590f1808a75be280420d910bcb0e2f7f575c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/234.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/235.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/235.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7a1b95b3f772f08b67ed8444714057f010887c3c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/235.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/236.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/236.manifest new file mode 100644 index 0000000000000000000000000000000000000000..08a63c72206d0f2a420b1e22024797d632a66e27 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/236.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/237.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/237.manifest new file mode 100644 index 0000000000000000000000000000000000000000..76960c621b837e7399839d6710c12fa1bea3b1bd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/237.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/238.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/238.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9c1c066e9407d9cfc9da46ad943906fac57087d1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/238.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/239.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/239.manifest new file mode 100644 index 0000000000000000000000000000000000000000..381f428a5b20e6c72f72e818cecc6466fbc00df2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/239.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/24.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/24.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1717590c36adc09147893ecc0a7d971aebe475bc Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/24.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/240.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/240.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8e4e2e45c8a0e5b11a6fec348c68a821932a667e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/240.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/241.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/241.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0b0e1d1eee672271460442a08df3371a679586ee Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/241.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/242.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/242.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a026d1530bc69ac693e2f2d981db1be2c2f6d567 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/242.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/243.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/243.manifest new file mode 100644 index 0000000000000000000000000000000000000000..44f03791510482ada2086eb6d8d4b35c2449af48 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/243.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/244.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/244.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a3fe3c0ced5097937b3a4e1fb0161681cc6b170e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/244.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/245.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/245.manifest new file mode 100644 index 0000000000000000000000000000000000000000..85afb46f25c9544778f09cd98a19532667a7bb88 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/245.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/246.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/246.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6e07d530cb3b4ad5b0753018ced80bda53e68c44 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/246.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/247.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/247.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fa1c1795410ae52388da7b366c6f5aacf4419f68 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/247.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/248.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/248.manifest new file mode 100644 index 0000000000000000000000000000000000000000..86013b17943c8bba9cc0bb296612c9146822aef9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/248.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/249.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/249.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..2607b25baca0bcb9059df65c067c8684ba402449 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/249.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/25.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/25.manifest new file mode 100644 index 0000000000000000000000000000000000000000..78c8690e61ee52579280ccb1be8e907ce0bda5ac Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/25.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/250.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/250.manifest new file mode 100644 index 0000000000000000000000000000000000000000..28239e6a8ce4a3900cf13869b7a07a5b227e53c5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/250.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/251.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/251.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2316c233104a3623c5bdf655d0720928752cf1aa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/251.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/252.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/252.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d6e6509166e5ceeb2e7ffe28ff8a64611e7a1ab Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/252.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/253.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/253.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1894b49dbc290952a498a78a2d3d71b0b371f76e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/253.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/254.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/254.manifest new file mode 100644 index 0000000000000000000000000000000000000000..71d075b147d4cc4891fb4bbee173a761a3bce617 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/254.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/255.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/255.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1e210b4cb5ebc3ec7e376d3160518ce3d00c65c1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/255.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/256.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/256.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8c8a5bbf2847e02060d02db4552f798cb0c6baac Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/256.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/257.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/257.manifest new file mode 100644 index 0000000000000000000000000000000000000000..407c6419d2b75d0e62ddd3e4ac266b961119d68c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/257.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/258.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/258.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fe30cc515973f46fc5395fd329fce7ae8ecae64e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/258.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/259.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/259.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ad3cb39e0725f6e684b9f7f9241b39d8f70d8f5c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/259.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/26.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/26.manifest new file mode 100644 index 0000000000000000000000000000000000000000..58f1f1f66bd114d7d5f24b4afc3d2ac9db9d1890 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/26.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/260.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/260.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d51d821412487cdd90493373f518438d95974ce Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/260.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/261.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/261.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b8f881d62e9b64572bf769a3088880a8f664fe1b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/261.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/262.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/262.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a80cdb11fc5b4900c418f61f14e41b8ba3fa0ca7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/262.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/263.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/263.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ad582cc7fd94cc19677ac4fc5936cc926bb11d6e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/263.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/264.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/264.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f80d21026c5c61cb1b2918c3e08f5fc969ea2a1b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/264.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/265.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/265.manifest new file mode 100644 index 0000000000000000000000000000000000000000..44280d76942c0338e89e8d01e9aba0c51def62a3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/265.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/266.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/266.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1bdda995b06286cbb2e2103bdf6a1b90f598a2e2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/266.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/267.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/267.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cfb034815f5cac90820ab2575f8c264e6b5bdb69 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/267.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/268.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/268.manifest new file mode 100644 index 0000000000000000000000000000000000000000..554649fe052c43f5b188f891b346635e6e2f1bab Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/268.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/269.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/269.manifest new file mode 100644 index 0000000000000000000000000000000000000000..653d533c844fa9731640058a827799edf6057d1e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/269.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/27.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/27.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5f45f4f221e422d58574d0545952640e16271b57 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/27.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/270.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/270.manifest new file mode 100644 index 0000000000000000000000000000000000000000..448fc0690f069393cc838efab3e829b15d2b44e8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/270.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/271.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/271.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cd8d3ee1b64c10754bb531a65225e7ee026bc54d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/271.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/272.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/272.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ccc0b275fb930edec1a7ec0488bdebb2683e079 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/272.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/273.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/273.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2c776fc6ffec7e7ab12ca8436f4c424b9ca9a279 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/273.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/274.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/274.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6eebd8a4cab8b80c2903e15d38d2ce733231291f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/274.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/275.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/275.manifest new file mode 100644 index 0000000000000000000000000000000000000000..967f1f6fbf341b5f02c406cdd9a14fd1c85787b9 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/275.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/276.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/276.manifest new file mode 100644 index 0000000000000000000000000000000000000000..10aea9435118abcfece5c3b839745cdd76302d72 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/276.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/277.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/277.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4e91582dd1e307469efbe2d1662b61ba5661f9f2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/277.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/278.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/278.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a1ac048c37ea54080e786006a313909092eef91d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/278.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/279.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/279.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1ced3c727272ac307ee1565d500e6f899cc1d0bb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/279.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/28.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/28.manifest new file mode 100644 index 0000000000000000000000000000000000000000..53ea43d416687700feccaf90cab40545e446cabb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/28.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/280.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/280.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cf8c1a2065ec4cc4617c2b34a8accc76d04b1635 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/280.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/281.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/281.manifest new file mode 100644 index 0000000000000000000000000000000000000000..30d5ce7e0646ba64caf1c07d68d5ab8c27fffd6d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/281.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/282.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/282.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ec13e5e4a2d6b4d843a23ca85396f2fd4cc2d0e0 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/282.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/283.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/283.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4d4d64aa7de0acb4e4042b1766d8385e2a53fbaa Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/283.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/284.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/284.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..6a3ac518f3f72d71eb00de36455bf9d19f438087 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/284.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/285.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/285.manifest new file mode 100644 index 0000000000000000000000000000000000000000..82914098e023a1c79dfef9c3c311192cae993cbc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/285.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/286.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/286.manifest new file mode 100644 index 0000000000000000000000000000000000000000..94bee7032e627b433933c3f16a4c2d41b86951d9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/286.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/287.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/287.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da4f0d4c566e5dff2a616d37484f779a3d9d0f28 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/287.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/288.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/288.manifest new file mode 100644 index 0000000000000000000000000000000000000000..85679e4d230378608db6f207c04283a83c953fad Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/288.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/289.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/289.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f932ed8eee5251eba45696adce3ab44394f94972 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/289.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/29.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/29.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f65812b2885539ff1b3e5607b69092dc66f8af9f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/29.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/290.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/290.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b55bd171165d4f4f20f10f292cc40c545845e367 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/290.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/291.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/291.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a0ff17cb14ce47165b2b1b4d70692c8d563f533a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/291.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/292.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/292.manifest new file mode 100644 index 0000000000000000000000000000000000000000..00a33448d74460051b371b5edbffa3febf79b9ab Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/292.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/293.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/293.manifest new file mode 100644 index 0000000000000000000000000000000000000000..454c5cc7aa7a98ed8358023708a883b79da0a0d4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/293.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/294.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/294.manifest new file mode 100644 index 0000000000000000000000000000000000000000..767f07ade7a1380c1c3bd8aa29ec2381ad1875a8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/294.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/295.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/295.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4f0bce2d7901b4495171cfd01b8ef5c2fc2e7a02 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/295.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/296.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/296.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6201b9ff151323dcbdba6544c2d4ca53afa9e33c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/296.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/297.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/297.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5ef9e6d65d786d5b5f6fd41c5f32b345d780eb28 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/297.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/298.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/298.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7d549e5a76acc4258a90baa47e27b8c96212037e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/298.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/299.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/299.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d1ca0467a664f7186f37df8ab4317de38b2f926d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/299.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/3.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/3.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c8412ae17d25874745bb501349c8e0a03232a151 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/3.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/30.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/30.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1832203de278389825759942eab6aaad6a62643f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/30.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/300.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/300.manifest new file mode 100644 index 0000000000000000000000000000000000000000..156e0cd07002ca34212c3950ab55b49b55d4792b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/300.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/301.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/301.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a9d3d5c98494436aa46f58385d6e8f61f95891bd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/301.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/302.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/302.manifest new file mode 100644 index 0000000000000000000000000000000000000000..63d047fd172810fef9abb3599cd78f59564af859 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/302.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/303.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/303.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c4fc2343885f16034c796c6ed819a889c5d29f00 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/303.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/304.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/304.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60c5db28e177c87fda662920a8509e86dffd9d52 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/304.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/305.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/305.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6c8104871cd25904f8931b35595d9c4a1b094d3f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/305.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/306.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/306.manifest new file mode 100644 index 0000000000000000000000000000000000000000..160c3015f79cb84aa241fe48c39f4c8b13f77b19 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/306.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/307.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/307.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b17176f30e50570e1a326214345b2ad2cbbb785a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/307.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/308.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/308.manifest new file mode 100644 index 0000000000000000000000000000000000000000..07c8cf162c613e8bbe87eccdc8b1829c592afd5f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/308.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/309.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/309.manifest new file mode 100644 index 0000000000000000000000000000000000000000..35ecbc08b9d388d384ede1bb8cc1d0a2c8d6f2b3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/309.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/31.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/31.manifest new file mode 100644 index 0000000000000000000000000000000000000000..88cfeed22325942290170038c30cf0fe0a97ff29 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/31.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/310.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/310.manifest new file mode 100644 index 0000000000000000000000000000000000000000..76f5b4885cd56ade29302ddb60c91526441eec08 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/310.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/311.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/311.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3f16cd28692c2e0f8e758bf8ef323677339c711b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/311.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/312.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/312.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6485b5e512ab0871faa55c1f233eb99bfb52b28e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/312.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/313.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/313.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bdbea6e66ebb7d39d8bc0e67ce13c434c1fb1e50 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/313.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/314.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/314.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0de5ea49aa7e0d867143ecf9e5d7feefa73f4652 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/314.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/315.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/315.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b8d3aaef21e76eaebdb92337ec99492e527c8a1f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/315.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/316.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/316.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6f94d7d814f71f61a305566e0cf74b76e624fcfb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/316.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/317.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/317.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd361e8cbb4a343c0fdd0553a642d4951ea39b91 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/317.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/318.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/318.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a954059efac7b3922275fdbdad58fa07efeda1d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/318.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/319.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/319.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..b16fcaa24b4f70bbcd4aa8e8a99e4aac597a06d5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/319.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/32.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/32.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5a3c098c8144d9a72e8f002e7c1b721903ea1a1a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/32.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/320.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/320.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f52e9c1169565d918c92e4bf1d8081c8ba0a59ad Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/320.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/321.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/321.manifest new file mode 100644 index 0000000000000000000000000000000000000000..25e450969f5bb35ddfce2c19193df8ec2be4ab15 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/321.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/322.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/322.manifest new file mode 100644 index 0000000000000000000000000000000000000000..64b4f3cba213bb9a3a82f7fed954203adddb5b09 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/322.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/323.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/323.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2bf13fde7c1cdfb02c2454ffde9b8d64dcd88711 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/323.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/324.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/324.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8b60c9b0c02da8c19de6650cc3b16dd128cb3f3c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/324.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/325.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/325.manifest new file mode 100644 index 0000000000000000000000000000000000000000..642f31c99950f6823d485434e26816c35d91e091 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/325.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/326.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/326.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1e1931fbaadaa2bef86d249a9109b94d35e3af3d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/326.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/327.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/327.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ca5e63ac120c2fc60310af7644286348fba6078 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/327.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/328.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/328.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a7104a452b230d801eb0ec0de0d68c2eb96e12b7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/328.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/329.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/329.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d3ee6acde9e5296ea0c0cef72baeb455f15fbd99 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/329.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/33.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/33.manifest new file mode 100644 index 0000000000000000000000000000000000000000..93573d62854dc650952ea181a15f783665d7dd6b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/33.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/330.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/330.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9c6769944963e1b8aca8ccf2b1cc05923c058bef Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/330.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/331.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/331.manifest new file mode 100644 index 0000000000000000000000000000000000000000..064a71e093d89e8468ae1b8edbcadf51ca518ca9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/331.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/332.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/332.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5f9757bcd3567355af86e5d44ac537657fa66db6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/332.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/333.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/333.manifest new file mode 100644 index 0000000000000000000000000000000000000000..46496a672bf1d9118f3d8f4156896b7f0473ae05 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/333.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/334.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/334.manifest new file mode 100644 index 0000000000000000000000000000000000000000..10fdca7aaef3c1f7252f50bbafd02006f082eac3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/334.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/335.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/335.manifest new file mode 100644 index 0000000000000000000000000000000000000000..98d496ddb505f5f85faa2b37e78b4429b1d91888 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/335.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/336.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/336.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c403b1e452ee5d96af8f86fcc16f957503a2a136 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/336.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/337.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/337.manifest new file mode 100644 index 0000000000000000000000000000000000000000..502652157a0188b871dde0a8f60f42a9ecec83dd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/337.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/338.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/338.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8f1d00d1e2068852fb3dfc67858189c8947fbd6e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/338.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/339.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/339.manifest new file mode 100644 index 0000000000000000000000000000000000000000..649a8aa112baa25fc3ddb7c2a1a485ae712449bc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/339.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/34.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/34.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3e5700c76fe3a2b1a2ce840a889b1d158e63274b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/34.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/340.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/340.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b7e62bf5f095f2e00cd48eb5d959976a3b69af80 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/340.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/341.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/341.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd19ba8f39a88f09e695fe752ef66b5aeb166a56 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/341.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/342.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/342.manifest new file mode 100644 index 0000000000000000000000000000000000000000..452357210efb54a604d13c8aeec5bb7edae3eab1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/342.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/343.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/343.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c25f867636c9723e68ac7c40d841a69ed1b4ec69 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/343.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/344.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/344.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02eac569f282256de24c854cdd75873c1aa2c4a2 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/344.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/345.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/345.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b1452f04eeb60ce6e0e628e3c818238db2f0ab28 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/345.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/346.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/346.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6fe7ca9157faee0187469b29c8713d3dbd93d84d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/346.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/347.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/347.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f17478fc7839c64f4f0a12a933148861c2a91de6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/347.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/348.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/348.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a6d5c70a312429b94ef68e9221584c8af952b16a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/348.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/349.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/349.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ede63ca145a6f04ba8603e8963684fefe1a5ce5e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/349.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/35.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/35.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f5771b7610da090454bdbd7a23ff64f4426c083d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/35.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/350.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/350.manifest new file mode 100644 index 0000000000000000000000000000000000000000..545e3143d720a6be0adaae8fad4437050e1c000b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/350.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/351.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/351.manifest new file mode 100644 index 0000000000000000000000000000000000000000..defa74ca661f9df3eb3e216bca8523c93b5d996b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/351.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/352.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/352.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c9451a3752799a20d640b894c1aaa47a0c8b2454 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/352.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/353.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/353.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9079d8c8650840c24284a3cad015496b8464d9c7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/353.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/354.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/354.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..c6a21b517cffcb1c01dabb759c531e3a8f7381be Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/354.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/355.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/355.manifest new file mode 100644 index 0000000000000000000000000000000000000000..32c039d2751de0a74e62f381c94f3f89cda3a389 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/355.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/356.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/356.manifest new file mode 100644 index 0000000000000000000000000000000000000000..29bcf6dbd03c13526f0b7b423bbd968dabe6c0bb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/356.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/357.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/357.manifest new file mode 100644 index 0000000000000000000000000000000000000000..99f313b08dc075efb98618ebdf7baa088ac012a1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/357.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/358.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/358.manifest new file mode 100644 index 0000000000000000000000000000000000000000..741559081dbf3dfa3fcf1c65ea63e5d7909dcbfd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/358.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/359.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/359.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b92e663bff53cdb0afddc6e6fbee71473b22edb1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/359.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/36.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/36.manifest new file mode 100644 index 0000000000000000000000000000000000000000..27fb66c2dac7e6ef89b31da28dc4b8af56bdde36 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/36.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/360.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/360.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a0ca11cbe7fc6f8a666c1232c9fdc8f69b6a88f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/360.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/361.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/361.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9a4d0e22e224f71c7a4616fd1b60ecc676ffaee3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/361.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/362.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/362.manifest new file mode 100644 index 0000000000000000000000000000000000000000..44b1abeb3ddfdca6b0d53ffcbd56c86d0de27928 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/362.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/363.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/363.manifest new file mode 100644 index 0000000000000000000000000000000000000000..15d404e532474050014af40dd5d7e72d64e09292 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/363.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/364.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/364.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e489c5de0abadcb0a2c56ef01895317464d05bc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/364.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/365.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/365.manifest new file mode 100644 index 0000000000000000000000000000000000000000..db7907e843d1555dd3989d67247051e56ec15ff4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/365.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/366.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/366.manifest new file mode 100644 index 0000000000000000000000000000000000000000..080f358f8fb0cca130fc6bded17abee9b3839488 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/366.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/367.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/367.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a3a6329bbe7bae082b4b8d53fbe453938e1048ab Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/367.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/368.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/368.manifest new file mode 100644 index 0000000000000000000000000000000000000000..96c843d2d5bc8c6c79b285271a3fdd73c31d62ea Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/368.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/369.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/369.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b633210f95289320d117e838f74bca3c03501b9a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/369.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/37.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/37.manifest new file mode 100644 index 0000000000000000000000000000000000000000..57c4a8b5173e0d92ff6964fe4f0c816ead67cca7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/37.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/370.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/370.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d96355a54ac6b0193ea43d94811fcb154718070 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/370.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/371.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/371.manifest new file mode 100644 index 0000000000000000000000000000000000000000..009c298a196d7cb3149090078bf658beb14f159b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/371.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/372.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/372.manifest new file mode 100644 index 0000000000000000000000000000000000000000..959ba97dafdb6880887c6271f811984c8ca54819 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/372.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/373.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/373.manifest new file mode 100644 index 0000000000000000000000000000000000000000..41147da1caa2329508243103c534f58a91afb6f3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/373.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/374.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/374.manifest new file mode 100644 index 0000000000000000000000000000000000000000..845cb6705ade323676cabeab6fe3f4d4422f4a2d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/374.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/375.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/375.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2ab914d1f566b310072dc1dc28862b60ceb2e97e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/375.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/376.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/376.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f3cc3a90cf1fb2f5c66415bee5fbba2c6799b6e8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/376.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/377.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/377.manifest new file mode 100644 index 0000000000000000000000000000000000000000..25e556678e3d3fea8fc0de7bab66a1c7ee7d2240 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/377.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/378.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/378.manifest new file mode 100644 index 0000000000000000000000000000000000000000..16673de4bdcbe4a731fbbcf43d6f9f77084d7053 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/378.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/379.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/379.manifest new file mode 100644 index 0000000000000000000000000000000000000000..82dff15b53af12e05af372c1c2b3f10312f7cc79 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/379.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/38.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/38.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2166801f4ea14aabd216d5d2d442e0da71817aae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/38.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/380.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/380.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8710cc7e22033e8e2aacfd1eea582dc3b0653ff0 Binary files /dev/null and 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/380.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/381.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/381.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b2c5a7273a7f3808157ea18a435c2cc300fe0fe7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/381.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/382.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/382.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bb34c6be1c4d3f983a31e363a83d426c4a889f72 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/382.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/383.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/383.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8e8d3e6979cd8681b2accddd4245480edd784355 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/383.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/384.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/384.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2bb1bc7455e607b88ea764ee3933a993018c026b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/384.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/385.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/385.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8d46d06475c8ab04eb4d3dbc852075b592d0bc4d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/385.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/386.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/386.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0cd745bd2bfaab333972e775696cfe99977b5734 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/386.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/387.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/387.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9d48d8c70c7bf8bed73777c12e91fb614c134f18 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/387.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/388.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/388.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02e47b4b96d64bf15c9d539232334b9521a288c3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/388.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/389.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/389.manifest new file mode 100644 index 0000000000000000000000000000000000000000..551b91d52050eb6dc04356282e19d21d348e08fc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/389.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/39.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/39.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..ee425130ad87b9821a0d179c339682a47efa53b1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/39.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/390.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/390.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3b3585dda14f0a27595c9adf9d81c8f5319b97b9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/390.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/391.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/391.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f3845c756689466148c56a035e5be90fc1f3e502 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/391.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/392.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/392.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f8c511ecc8305ea192b9f737f6dc1bcbc505b593 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/392.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/393.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/393.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bae2b2e9483fba7e3788d8ca42481fb0e7f51d70 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/393.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/394.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/394.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3f3c637f96fda01dc7bb098950037363efcc78e9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/394.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/395.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/395.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c9f969187903ffd838d342e2e36a02b9a2b9461c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/395.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/4.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/4.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b09f27ec48d74182b6c1e02aeff5d62f13e6d8b4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/4.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/40.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/40.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0d6a9134268af79109fec2f5a837fecc5a47a50a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/40.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/41.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/41.manifest new file mode 100644 index 0000000000000000000000000000000000000000..857399a5985c73784c6ea81bc5380d919c68c645 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/41.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/42.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/42.manifest new 
file mode 100644 index 0000000000000000000000000000000000000000..3f214f71193236d22ca445b2ff0c575798730ebf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/42.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/43.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/43.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7e58e81730db2a6448c17a5a78a0794906950f0d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/43.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/44.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/44.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c681f7f0178a2339e329f10e693abfa23a422220 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/44.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/45.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/45.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9faf340b2bc37e0f65b709948c235de5d82d9224 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/45.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/46.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/46.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8c337fbf223d3a8f440303d350dda8928ae79391 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/46.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/47.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/47.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1c3d209e5de13966f43b6de2d4e7e8fe4fc0df7a Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/47.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/48.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/48.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02081ac6dc7b2274a09d300703fc399016e03bee Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/48.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/49.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/49.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0b8ff8fc0373da2c833b56a65de34f1c9d83d95c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/49.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/5.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/5.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f21b97d93343ca12f31186017b6268deb8a16dcd Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/5.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/50.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/50.manifest new file mode 100644 index 0000000000000000000000000000000000000000..57a1a09638f353c73066a38585d25e315bff5ea4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/50.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/51.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/51.manifest 
new file mode 100644 index 0000000000000000000000000000000000000000..7b964e23e75b9872be57b7d87e25e91bbba38147 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/51.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/52.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/52.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9872e39cd64feda6bd755e95782cf09475a3fc9b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/52.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/53.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/53.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b2be2290fc8ad87c1e8e37a1e4076468e3d66641 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/53.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/54.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/54.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a823521bf14d549dbf72155315f9e1ad01d9a6e8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/54.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/55.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/55.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d587135a076eeec6498bfc329f21c57c4709fa14 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/55.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/56.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/56.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a937baf80027ebcbaafc8933b42c06562003475e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/56.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/57.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/57.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d44fdcceb4673109d28dae067135abddb9af4d1d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/57.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/58.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/58.manifest new file mode 100644 index 0000000000000000000000000000000000000000..af44a67fcd47ba8bf7df274208397009c74d44ae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/58.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/59.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/59.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b902a5eb155b9eef1a64ca39ce9c9699ef8597e9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/59.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/6.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/6.manifest new file mode 100644 index 0000000000000000000000000000000000000000..47c44e486734f7edaad28e31f84ffdcf6a50ff80 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/6.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/60.manifest 
b/.lancedb/content_aware_chunking_BAAI.lance/_versions/60.manifest new file mode 100644 index 0000000000000000000000000000000000000000..92935540a834571c25c3515cf5939a8a7edf1c43 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/60.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/61.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/61.manifest new file mode 100644 index 0000000000000000000000000000000000000000..52a382de5013e6419fa971f39455474a4fc9c9bb Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/61.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/62.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/62.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6d88c4fedc86a5225109ff34ab95ab2ce31848d9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/62.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/63.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/63.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7480ed1bb16500cbb37cae25d09798f35528d5d6 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/63.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/64.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/64.manifest new file mode 100644 index 0000000000000000000000000000000000000000..19426343073119ce2e9fb7e211b36bfebdd14168 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/64.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/65.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/65.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dee17a741c529e353c7be05a6d8fc6dc299f85ae Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/65.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/66.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/66.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f508cc3af84f4b0545f7493ee607a071905f5220 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/66.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/67.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/67.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2a7d9aacb5a943288c0ea29674ae01e295783181 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/67.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/68.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/68.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fff2aa337d9ac471740f1bbd891b38c3daa4ddf7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/68.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/69.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/69.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a0463d4b32f14f632c024358f3a577e976d62fcc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/69.manifest differ diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/_versions/7.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/7.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ec9ce4e1c961ef261d7eddd1e567427ef0014ed9 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/7.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/70.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/70.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1eaa4362022b7df1c24bd5bc8d8a0051fee1e753 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/70.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/71.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/71.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6361b2056543d655ad13ff0174c6ed625d13026c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/71.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/72.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/72.manifest new file mode 100644 index 0000000000000000000000000000000000000000..420db9627eb08cf331ed7393a819755b1cce094b Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/72.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/73.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/73.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3a7f97f78ed1b860c1be659ba0cf065c0cab9aac Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/73.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/74.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/74.manifest new file mode 100644 index 0000000000000000000000000000000000000000..900d539191b39a197e96a4b628565d6c0ac836b4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/74.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/75.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/75.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b54fd5c555e0c2e2610192632d51e2803fb6fb7 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/75.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/76.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/76.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cb5222bd5fba55e9421fe1797387c1a1726e250f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/76.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/77.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/77.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f48556cb7ab9c5f008d58b31bda518cb897eb45e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/77.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/78.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/78.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b9cb529a1ce52d771d8033717f07aeb02eaf7b6c Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/78.manifest differ diff 
--git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/79.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/79.manifest new file mode 100644 index 0000000000000000000000000000000000000000..05830af1e8bbd3188b1210e0d927ba1b50a2d54d Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/79.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/8.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/8.manifest new file mode 100644 index 0000000000000000000000000000000000000000..36dd2274914b00560fdec34faec900edacfc5c52 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/8.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/80.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/80.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f7fbd0806965a6d248db622a31187ad5b427c629 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/80.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/81.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/81.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb5349aa74274dc08285184e224f9e686cc8bc79 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/81.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/82.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/82.manifest new file mode 100644 index 0000000000000000000000000000000000000000..53410544537873b9ee3d87f2054683761c3febbf Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/82.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/83.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/83.manifest new file mode 100644 index 0000000000000000000000000000000000000000..761bd1cdc656cfa3a1044b92fd05eeb0b83bac4e Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/83.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/84.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/84.manifest new file mode 100644 index 0000000000000000000000000000000000000000..902ef4521a333b1251d11eac31c4a17cab13e326 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/84.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/85.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/85.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f3fa69007c054afd035ba9b2cf2d5741c184f1ec Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/85.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/86.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/86.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ec6429877939c1cdd177d3111da85196a666585 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/86.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/87.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/87.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4793164903fd145abed103bb8eaae20a1b51e196 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/87.manifest differ 
diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/88.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/88.manifest new file mode 100644 index 0000000000000000000000000000000000000000..37045fad2c0586ff9ae0c5c92c0fb2ca6bb4760f Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/88.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/89.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/89.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ab46cb53d927598db86a260f721d6e227c94b8e3 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/89.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/9.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/9.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a4fc15e4942ed2228cd2cdf8e40fa36cc19eedbc Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/9.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/90.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/90.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e2169cd76038628fe0d1c12972ac81c136ad67d8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/90.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/91.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/91.manifest new file mode 100644 index 0000000000000000000000000000000000000000..33cb508c22b5994781501862e082ca6d0b291723 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/91.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/92.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/92.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0c2d481a12b2f8b3672a28cbe5350050144841d4 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/92.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/93.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/93.manifest new file mode 100644 index 0000000000000000000000000000000000000000..62a09315a4e5c391e7d406ceed2db7d95f02e3a1 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/93.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/94.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/94.manifest new file mode 100644 index 0000000000000000000000000000000000000000..31396e7417e32bef37be55206ed38deaa7ff7249 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/94.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/95.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/95.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3bcabef4aa8381725bb22b35ffd757eeeaa796ee Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/95.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/96.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/96.manifest new file mode 100644 index 0000000000000000000000000000000000000000..29aba0a72bc6aef6a736d0f45e8f0b8767e35728 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/96.manifest 
differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/97.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/97.manifest new file mode 100644 index 0000000000000000000000000000000000000000..08e5d88d260a60499355228f1c3c2f33e4c8f597 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/97.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/98.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/98.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ff92ae0e408a08304166981fba2da2575fe900c5 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/98.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/_versions/99.manifest b/.lancedb/content_aware_chunking_BAAI.lance/_versions/99.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9a78cbc768b8f18d80e434dab1c08165bfdf5bf8 Binary files /dev/null and b/.lancedb/content_aware_chunking_BAAI.lance/_versions/99.manifest differ diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0096631d-31a0-4d0c-9d15-063269d37fbd.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0096631d-31a0-4d0c-9d15-063269d37fbd.lance new file mode 100644 index 0000000000000000000000000000000000000000..5a11d655b3f539993f1a0fc65fb64adeaf5da88b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0096631d-31a0-4d0c-9d15-063269d37fbd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff5e99131c5e7605a840192a7ad1c67078ac90d2050611cdc8cfbb17a4bc9d4 +size 144669 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/015b5de5-2644-4f44-95dd-e0579bd3b659.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/015b5de5-2644-4f44-95dd-e0579bd3b659.lance new file mode 100644 index 0000000000000000000000000000000000000000..ca3a88f54d588bc4cf044fa0548a14cc471b4a3e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/015b5de5-2644-4f44-95dd-e0579bd3b659.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8ead68b610d7ae8aa890d7987455bf083311a0c673276faf9365fd83c504b5f +size 140472 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/025a6905-e5ab-4923-89ff-31b9d0a29100.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/025a6905-e5ab-4923-89ff-31b9d0a29100.lance new file mode 100644 index 0000000000000000000000000000000000000000..139042c4d09d44cae284962e544b6f13f495fca3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/025a6905-e5ab-4923-89ff-31b9d0a29100.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cec14682d994d019e8b3d53e2f5d26d5fe749baf9e655b598898b68b3595cd56 +size 142866 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/029dc0e7-3d6a-411b-b99c-46d332528ab6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/029dc0e7-3d6a-411b-b99c-46d332528ab6.lance new file mode 100644 index 0000000000000000000000000000000000000000..424a43e5eeb2c2b2ac12cee84be117b5e0220c2a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/029dc0e7-3d6a-411b-b99c-46d332528ab6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54b52b33bca3d5bf39d438cda68a6c305f2e3821e54e1d87ea86d699ede2e66 +size 131319 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/02ed27d3-12b5-4017-8ca6-6aede5c60db6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/02ed27d3-12b5-4017-8ca6-6aede5c60db6.lance new file 
mode 100644 index 0000000000000000000000000000000000000000..94373664ba854e77d74e0be6d58f7769a2f0b9b5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/02ed27d3-12b5-4017-8ca6-6aede5c60db6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f9dcb17a8addff529f18dc8813bd957ea78eaab6bf5244ce33a421989ac556c +size 136192 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/038d22fe-696b-4ba6-a598-a384ee0fe0f5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/038d22fe-696b-4ba6-a598-a384ee0fe0f5.lance new file mode 100644 index 0000000000000000000000000000000000000000..89a4a5aa9ca29608fc8ca81cb8b5c7df8551eed2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/038d22fe-696b-4ba6-a598-a384ee0fe0f5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f6b2456098c652be8c298aa57ef921d6ce8865ac0fa0e68c65a77837501883 +size 140199 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/04bdd218-765d-49ba-ad8a-c25b2e912b22.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/04bdd218-765d-49ba-ad8a-c25b2e912b22.lance new file mode 100644 index 0000000000000000000000000000000000000000..4d4163c28ca0eaf247b5bae185db569e3bd859ca --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/04bdd218-765d-49ba-ad8a-c25b2e912b22.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0292e0fe929514f48be2f54de27282d38c4032521b665a93341ed7b4b978eb87 +size 137608 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/061c2414-9a17-4958-bdf8-6252968f9562.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/061c2414-9a17-4958-bdf8-6252968f9562.lance new file mode 100644 index 0000000000000000000000000000000000000000..f05309223d6b1ea858714049f8783957910cb0d2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/061c2414-9a17-4958-bdf8-6252968f9562.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9442e6888fc024283bfa81c13adc00217c4ff6d6577064dcd785479981f63283 +size 144411 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/063c4b1d-e8c4-45a5-9242-8778ccdef676.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/063c4b1d-e8c4-45a5-9242-8778ccdef676.lance new file mode 100644 index 0000000000000000000000000000000000000000..dfc7a0d2eb165cbbf77f8cc7164daac672198702 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/063c4b1d-e8c4-45a5-9242-8778ccdef676.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca416debc933e0acb76200933a8754ab7a861013ef1d110bec5171e0c6d5182 +size 139331 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/07e52c33-5697-4877-93de-09a46faea9b9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/07e52c33-5697-4877-93de-09a46faea9b9.lance new file mode 100644 index 0000000000000000000000000000000000000000..f11151d6b504f9e4cdbfaa05e31e4259ab653d60 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/07e52c33-5697-4877-93de-09a46faea9b9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37d19cd13a7e1c208d383c77d6a52e646af01f5bd1cdaa6057cc679d22af33ec +size 135533 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0809d22e-2194-4739-b099-0c57ea164c17.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0809d22e-2194-4739-b099-0c57ea164c17.lance new file mode 100644 index 0000000000000000000000000000000000000000..ff2e673fffa65ce27bcda06c27c0f7597f66091c --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/0809d22e-2194-4739-b099-0c57ea164c17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94f84d5cb098c4a75770d8bca33fdc6068c8a8bc12d1222664bf2c0382fd4def +size 142907 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/08674ac1-2165-4d45-b7d4-6f936938c195.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/08674ac1-2165-4d45-b7d4-6f936938c195.lance new file mode 100644 index 0000000000000000000000000000000000000000..8bc8e59f647f4b99f672398e0fc6ec747e7cbe35 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/08674ac1-2165-4d45-b7d4-6f936938c195.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ec2fdb3bec6e5e0c85556c3ca3ea3f32f13147f783fa2dfd39092571725e44 +size 140986 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0987af35-5752-49ee-ae27-49df56e6dfb2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0987af35-5752-49ee-ae27-49df56e6dfb2.lance new file mode 100644 index 0000000000000000000000000000000000000000..9ec951e83e75551ec600b8d7308983fef4e5ff08 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0987af35-5752-49ee-ae27-49df56e6dfb2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:784d39fb52c777aa2e20b778dc8342c90ec56c43ea43114d20fd747d687ac299 +size 146314 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0aeb87e9-ee3a-416f-95bc-24eb0e481158.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0aeb87e9-ee3a-416f-95bc-24eb0e481158.lance new file mode 100644 index 0000000000000000000000000000000000000000..3658a80e277114418c81d381404b70ac2e97cb67 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0aeb87e9-ee3a-416f-95bc-24eb0e481158.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:800e691754c8afbd78f7c864affec93bb0fbbdd992d23b783de3792a135c6b23 +size 139923 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0b65eb3e-04e0-4746-b2f8-3abcb446d53d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0b65eb3e-04e0-4746-b2f8-3abcb446d53d.lance new file mode 100644 index 0000000000000000000000000000000000000000..c5d892f1d7c6c3ac237d81aa5d05495781ff32c3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0b65eb3e-04e0-4746-b2f8-3abcb446d53d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d71ebc625b5571ddb28c5a98e147a2c827d5962db914be6b59603de3107c3c8 +size 144876 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0c74c663-018f-49db-a15f-6092615e3e77.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0c74c663-018f-49db-a15f-6092615e3e77.lance new file mode 100644 index 0000000000000000000000000000000000000000..b83192c43ea9e0c58f410ad3c7b8d75a0ef722df --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0c74c663-018f-49db-a15f-6092615e3e77.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e1e085a61a557d2f6d6e672676073f33a1a7389e0ddc1f3a99747e69329f37 +size 141294 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0d3119b4-ea20-4fee-b86e-89d018e649c7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0d3119b4-ea20-4fee-b86e-89d018e649c7.lance new file mode 100644 index 0000000000000000000000000000000000000000..0c8c4ce68658dc0358785a241826a820ac311abe --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0d3119b4-ea20-4fee-b86e-89d018e649c7.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:213efe392e521bc17826f0bccaf0d69455aa3e2b693ddf844c3b8a0df36ee15e +size 139878 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0f221984-4798-41ea-b609-0e768725fc28.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0f221984-4798-41ea-b609-0e768725fc28.lance new file mode 100644 index 0000000000000000000000000000000000000000..942181dec317ff288ba66a9dd686e4ae00b4522b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0f221984-4798-41ea-b609-0e768725fc28.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d806a4c88d8412aaea976dd189d913bc28b8eced14e5f12fae63e82f6ffe5b1f +size 136545 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0f2c92a4-7db7-4315-9035-2ada7af05f0c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0f2c92a4-7db7-4315-9035-2ada7af05f0c.lance new file mode 100644 index 0000000000000000000000000000000000000000..cf13ae33ab3205957d3097dd938c2ed2d83ca3ff --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0f2c92a4-7db7-4315-9035-2ada7af05f0c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:050319e2111d0eb1c3351e24a5c3cc6213114c406ce6077baebea4e86f17c539 +size 134588 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/0f629f49-2c15-4572-a4a4-2102c5dfe213.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/0f629f49-2c15-4572-a4a4-2102c5dfe213.lance new file mode 100644 index 0000000000000000000000000000000000000000..1f97230c196bf25ff76565916fe94344dadb764f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/0f629f49-2c15-4572-a4a4-2102c5dfe213.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:381f6bc9cd1e8bdfd5500a0fe51c0e824deb62dd79f03fba29308cd7c31e759a +size 145212 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1002246e-1ad6-45fc-9b52-d8d94ecc05f7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1002246e-1ad6-45fc-9b52-d8d94ecc05f7.lance new file mode 100644 index 0000000000000000000000000000000000000000..65b9a3cb65f387b803f5c2a190fce9a9b3a73840 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1002246e-1ad6-45fc-9b52-d8d94ecc05f7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b1aa8bbc4714c05cafc636cc0b8ccdbbe650e1e9a99e984a3b171e7ca81123f +size 138255 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/103eef83-4d3c-49f0-af0a-a17bb5c81924.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/103eef83-4d3c-49f0-af0a-a17bb5c81924.lance new file mode 100644 index 0000000000000000000000000000000000000000..ecb5d0d4f7506ede6fda2e83fbe4a22712580f6c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/103eef83-4d3c-49f0-af0a-a17bb5c81924.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73711b7270b5ab40b0c257844fdbd86dae913c6c6aa89c4ea0f303d274eb1e6b +size 138974 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/109785da-4fae-447e-83dd-3788fc7fa444.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/109785da-4fae-447e-83dd-3788fc7fa444.lance new file mode 100644 index 0000000000000000000000000000000000000000..d2489c6017fa0ca0c99d58533d155d0396bdbcac --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/109785da-4fae-447e-83dd-3788fc7fa444.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10639beb5a76e4414ed6e455acba6602c8f04e8d39946c1eed503468762134a4 +size 148085 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/10ac1533-7e5f-40af-a6e4-97a36e44bc9e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/10ac1533-7e5f-40af-a6e4-97a36e44bc9e.lance new file mode 100644 index 0000000000000000000000000000000000000000..5c68705c8d31c9fa34c3162795b708d79c4bbc26 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/10ac1533-7e5f-40af-a6e4-97a36e44bc9e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c684dbdef40022eb1347eece7ba92559d565928a0442714fa95ba12af0dd35b0 +size 146165 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/10d56159-5092-4853-8dcb-50f701db07f5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/10d56159-5092-4853-8dcb-50f701db07f5.lance new file mode 100644 index 0000000000000000000000000000000000000000..89170cca281178d7032aa91bbf87d2e0e780732d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/10d56159-5092-4853-8dcb-50f701db07f5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10b815f31ba8dd1dec730dc17ed1ad412326e263b343ce9fc3b2d28fd4e8a00f +size 139981 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1213b8fb-4af4-4e8b-8e9e-c043316d8548.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1213b8fb-4af4-4e8b-8e9e-c043316d8548.lance new file mode 100644 index 0000000000000000000000000000000000000000..f3639b71cd2d5766039e94ae0d2d6ba5025ef7b6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1213b8fb-4af4-4e8b-8e9e-c043316d8548.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec211f5556a2f040c10e6d14c58b54513bab66635dadfc54e55297ecc8cc36a4 +size 149518 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1372a804-fa4b-4ac8-ab71-a19789e3bb27.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1372a804-fa4b-4ac8-ab71-a19789e3bb27.lance new file mode 100644 index 0000000000000000000000000000000000000000..727e4a2c328bbb2b776418aaddb2e2ce9d18083d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1372a804-fa4b-4ac8-ab71-a19789e3bb27.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2f6c78601b91d6da8cbe1d220c21d628b909a6144728cfba6e19dbc15ec503b +size 137378 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/139f688f-b71d-41e4-b417-020e712394ac.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/139f688f-b71d-41e4-b417-020e712394ac.lance new file mode 100644 index 0000000000000000000000000000000000000000..dfe2da0e360448839d2f413c27d45d50a0edd92d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/139f688f-b71d-41e4-b417-020e712394ac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d24709d6d4a40ad6d0176c90f77227fa09fd5e20fe520a8efb4ae1dafb61fd5d +size 140105 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/14372d1d-95c2-48fc-bfab-9d092c3153f2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/14372d1d-95c2-48fc-bfab-9d092c3153f2.lance new file mode 100644 index 0000000000000000000000000000000000000000..0ed2cacacd42655f2201f3dc30bc38ccb377286b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/14372d1d-95c2-48fc-bfab-9d092c3153f2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e66e3a589b19b45b6e472fed3da782b06a50a67e5fe3fb451b3dd1862f4a9fb +size 140862 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1437d78f-d505-44e5-85fe-7c931ce91677.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/1437d78f-d505-44e5-85fe-7c931ce91677.lance new file mode 100644 index 0000000000000000000000000000000000000000..c175fd12a107b0e64413041eecd329bc19fa7d6d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1437d78f-d505-44e5-85fe-7c931ce91677.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3701ad72dca941bfde203679666f87aca4a2018cb1b1d4a3a7052e570c53f8f5 +size 144752 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/15c2f5de-b533-41a8-b1da-bb42a27e2053.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/15c2f5de-b533-41a8-b1da-bb42a27e2053.lance new file mode 100644 index 0000000000000000000000000000000000000000..86d09f1708abcbe7cf35a793d69f9ed26d193ce3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/15c2f5de-b533-41a8-b1da-bb42a27e2053.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:524d6cfe0592f2cf4ab656685c51061d817eee55c4778e66ed8093f9a98adeac +size 147109 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/15f3c8a8-7084-4e8f-ae64-6c811cf531c6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/15f3c8a8-7084-4e8f-ae64-6c811cf531c6.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b8284113a5a7df54991cf40246428b0fa69a959 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/15f3c8a8-7084-4e8f-ae64-6c811cf531c6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3cf0e1d912098c1efe054168b9e3798224d33ee930048cadce435f25f62fed +size 139752 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/164b9f47-f449-4598-9143-65db7e09e8aa.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/164b9f47-f449-4598-9143-65db7e09e8aa.lance new file mode 100644 index 0000000000000000000000000000000000000000..3c16f467b6c6e185a359d00e6523004e417fd015 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/164b9f47-f449-4598-9143-65db7e09e8aa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71bd4d4cd4262acadbaff579e0551417ac99c1c1c3f629190ee40595a32dd9b9 +size 144653 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1675ca3b-98a2-41aa-a68d-1b3e26e85cb9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1675ca3b-98a2-41aa-a68d-1b3e26e85cb9.lance new file mode 100644 index 0000000000000000000000000000000000000000..1d918e668932d508b150c735dd5d6f1ccf6d04f9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1675ca3b-98a2-41aa-a68d-1b3e26e85cb9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d753ef2c08a8e61db4ecb3975ac72faca65fd40b16521cd9cbc5fb8bd1a4756 +size 152477 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/169f2562-af4c-4ae5-ad38-b011532009a7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/169f2562-af4c-4ae5-ad38-b011532009a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..c230a2864b986b60d51460b545d3d320b462cdcc --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/169f2562-af4c-4ae5-ad38-b011532009a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ea1be17288992957470b98308df7d59c6b59f3e445ba0e089e66116afb1500 +size 140250 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/16bacc63-6d61-4e7e-91c3-185035a2731c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/16bacc63-6d61-4e7e-91c3-185035a2731c.lance new file mode 100644 index 
0000000000000000000000000000000000000000..9ea3022f405b5d5069a6b63bf4b574e4e8aba00b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/16bacc63-6d61-4e7e-91c3-185035a2731c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78512af390a12ec24d0990b21836377cc99f346f0193e49ac1f9d6a585e53341 +size 139305 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/16c4e903-207c-4f26-90b2-098b7c866ad3.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/16c4e903-207c-4f26-90b2-098b7c866ad3.lance new file mode 100644 index 0000000000000000000000000000000000000000..d5f19b679c552fb757358f2a6344304d0598646f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/16c4e903-207c-4f26-90b2-098b7c866ad3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cb60ad542a0ccecdbe1b74cf993bb7495554e2d1703271fe3b50756ed6402f +size 134605 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/17c14f96-8d68-40e4-9b36-1b9e85bd4342.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/17c14f96-8d68-40e4-9b36-1b9e85bd4342.lance new file mode 100644 index 0000000000000000000000000000000000000000..83365291faa361d3d93eb210aaa1ee0300842fa6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/17c14f96-8d68-40e4-9b36-1b9e85bd4342.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de16a345e8180771e883e52c8a6768ce9bbd9d4b6a6cf05d6043b3c58715c11 +size 140394 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/17cd5821-a304-4a9a-93c1-be7652b3c20e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/17cd5821-a304-4a9a-93c1-be7652b3c20e.lance new file mode 100644 index 0000000000000000000000000000000000000000..9bbbd45c2d0412ee704a33a8f6df561c80dccbc5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/17cd5821-a304-4a9a-93c1-be7652b3c20e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc3c7293b20d073993c04bff1cb378b763e005e7be01e55868c76e032cbc746c +size 138821 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/182c81ed-d21c-47c3-93e4-7892a065c9d1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/182c81ed-d21c-47c3-93e4-7892a065c9d1.lance new file mode 100644 index 0000000000000000000000000000000000000000..dba7b9b2a935f4786c0acd734f8952fc8ad4a5df --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/182c81ed-d21c-47c3-93e4-7892a065c9d1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3674287288b24ad30a35f95b5dd9193e284f02814b13786f74fd8e81f7187a41 +size 140629 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1881d1a7-c1f9-48cc-83ad-c67d6b66de38.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1881d1a7-c1f9-48cc-83ad-c67d6b66de38.lance new file mode 100644 index 0000000000000000000000000000000000000000..f0642c357efec3b23995afb9e533f1a59c909026 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1881d1a7-c1f9-48cc-83ad-c67d6b66de38.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2ad06556f49f64883cb439e325d38b110f5df9cf54c34fe731342082b4a0ee0 +size 134622 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/18dcc893-3236-4893-a7f6-50ab5edca9c5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/18dcc893-3236-4893-a7f6-50ab5edca9c5.lance new file mode 100644 index 0000000000000000000000000000000000000000..386f1f7d756637c2eb54162aefe42e33e3e586bb --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/18dcc893-3236-4893-a7f6-50ab5edca9c5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3add134f9912bb4058985836f673f6081b93e4d405b3360a90ea2b0e9e62f1b9 +size 139382 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/19f5b493-ef87-4e51-a870-a38790d6858c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/19f5b493-ef87-4e51-a870-a38790d6858c.lance new file mode 100644 index 0000000000000000000000000000000000000000..846850bab3f1810039b51d20c92171d94e8ad2a7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/19f5b493-ef87-4e51-a870-a38790d6858c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3ff435e5f478c5de34ddef89570cd05d6d1c94bd2ece96f358c1bd0c68c381a +size 143233 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1a2e9354-a18b-4ef5-a046-9a56e0e465a5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1a2e9354-a18b-4ef5-a046-9a56e0e465a5.lance new file mode 100644 index 0000000000000000000000000000000000000000..4e1c18ac0023a7353f8918a20fbb228031e5503d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1a2e9354-a18b-4ef5-a046-9a56e0e465a5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b073a329e42e7a6d5dca2fa33e14818ff22eceed7771dec73f06eb72c958d8f8 +size 141167 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1a4051bb-9ae3-4438-9f45-bcc008ad1fa0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1a4051bb-9ae3-4438-9f45-bcc008ad1fa0.lance new file mode 100644 index 0000000000000000000000000000000000000000..436156e01e58a3c757b21ca51bda936f71180191 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1a4051bb-9ae3-4438-9f45-bcc008ad1fa0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa1c69b894fe7d233e8a27190ca332e716041f3d474554ee7957220306eef11b +size 136663 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1b0bcd65-1839-414c-a45b-f928b11cc94a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1b0bcd65-1839-414c-a45b-f928b11cc94a.lance new file mode 100644 index 0000000000000000000000000000000000000000..55f80952e1a231bb723d5aa6e5b09fd767ad2ce1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1b0bcd65-1839-414c-a45b-f928b11cc94a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e14f068918df2dc7d5f5efa1267606e5169c4020df9154c14123485cc6fe5dc +size 151385 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1b1d453e-f610-4b97-9460-13eb0ada06a1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1b1d453e-f610-4b97-9460-13eb0ada06a1.lance new file mode 100644 index 0000000000000000000000000000000000000000..dfbfb2a7304d2c38525f04c9e8860400d69bcfc3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1b1d453e-f610-4b97-9460-13eb0ada06a1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a11fe8b11f57678629f0b23b83033b425c6acc8f36c39cd92d7be9d8e6e3beb +size 140016 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1b461764-2719-419e-afe2-12bf750316cd.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1b461764-2719-419e-afe2-12bf750316cd.lance new file mode 100644 index 0000000000000000000000000000000000000000..7e8996b4b126d40d8225981cc5ea78035d0dc91b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1b461764-2719-419e-afe2-12bf750316cd.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:0d519cc44a1a8495a5de17d8718347a9dd160058b1e7469f5e6f8690f4f7b1ba +size 139245 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1b60bf1a-7a7f-44f1-afe0-2962ecfd7056.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1b60bf1a-7a7f-44f1-afe0-2962ecfd7056.lance new file mode 100644 index 0000000000000000000000000000000000000000..37e8064cfa3c5294dfa0b2378adb882f9e9927a2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1b60bf1a-7a7f-44f1-afe0-2962ecfd7056.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6caeded135245eddb1c130df5adf2f2bc3e2018401422c83d374c66df256ee19 +size 137222 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1c9254f7-94fb-4d5e-8cb2-bbb3f9a2d538.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1c9254f7-94fb-4d5e-8cb2-bbb3f9a2d538.lance new file mode 100644 index 0000000000000000000000000000000000000000..f48322919f4fdca6073e707396147091b81ca852 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1c9254f7-94fb-4d5e-8cb2-bbb3f9a2d538.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9be977b5a2006cffce31ac7464fcc7f1ba27f5658bed5db9033093cd94b5ca10 +size 143320 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1cd89ce0-cac3-44de-8112-97b0a1628ee2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1cd89ce0-cac3-44de-8112-97b0a1628ee2.lance new file mode 100644 index 0000000000000000000000000000000000000000..aad50ccecc20579e7c88d1cf2c1d3d7a4b0bfca1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1cd89ce0-cac3-44de-8112-97b0a1628ee2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19212856eea032881ead3309a532987a11fe373e551a164e79339c079e834f0 +size 138333 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1d6b0962-5289-4709-b56d-cc18bd8754c4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1d6b0962-5289-4709-b56d-cc18bd8754c4.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e4b8ee089fa10147c05dc3dbcfd9838a9d5b6a3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1d6b0962-5289-4709-b56d-cc18bd8754c4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36480489ebf641d8f70aac80c5ce9336b9879bdd145fc4c0f9d172717aca8aa +size 138076 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1e00d9d9-eb47-40f9-b562-2b2edd4e8ccd.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1e00d9d9-eb47-40f9-b562-2b2edd4e8ccd.lance new file mode 100644 index 0000000000000000000000000000000000000000..8c3e8a97ae1a0dc18b743eca826fd3d87187dc52 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1e00d9d9-eb47-40f9-b562-2b2edd4e8ccd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac59ee0f898fc5e30996238206973c217c77dd0ddaab6604fce3a4f117b61dbb +size 145120 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1f909e16-adbc-40f0-b01f-585787c0f120.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1f909e16-adbc-40f0-b01f-585787c0f120.lance new file mode 100644 index 0000000000000000000000000000000000000000..e42ef9cc989bbf33cb3dee1dbaea5f31769524f1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1f909e16-adbc-40f0-b01f-585787c0f120.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce38d591feba77fbcd7964657ea4b3cd39a154be40f9c986fca3a1d2b9a73ab5 +size 141864 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/1f9e8b06-79bc-493b-9d3e-1ea3ccfbca37.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1f9e8b06-79bc-493b-9d3e-1ea3ccfbca37.lance new file mode 100644 index 0000000000000000000000000000000000000000..8fd0cc4fddf733b4f33b6fac77a7b98e1f9e1c82 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1f9e8b06-79bc-493b-9d3e-1ea3ccfbca37.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cecc065084c2c61cdb128882301218ccfbcc6317c8a0831b916f16f16d256473 +size 146742 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1fab8f0c-a5a7-4ccb-976a-d530d027b3d6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1fab8f0c-a5a7-4ccb-976a-d530d027b3d6.lance new file mode 100644 index 0000000000000000000000000000000000000000..df4c50c9168679828a81d344dbbe4b9d215b649e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1fab8f0c-a5a7-4ccb-976a-d530d027b3d6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96762c7ca4958ed6ba4cf5f2862c358f48c7177ad819b804af7ed3b147689dc4 +size 140461 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/1fc0d9e3-3662-4145-a99a-16c23563cee1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/1fc0d9e3-3662-4145-a99a-16c23563cee1.lance new file mode 100644 index 0000000000000000000000000000000000000000..4679f065e9af5c34906e69318e1fae4c711096bd --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/1fc0d9e3-3662-4145-a99a-16c23563cee1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be8c9207ff4a2902c49936ee9cbf7a6f2b3632fc9a7795306adab05174132c1 +size 149210 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/202bfd71-0044-4c6e-aacc-53aa44339335.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/202bfd71-0044-4c6e-aacc-53aa44339335.lance new file mode 100644 index 0000000000000000000000000000000000000000..81b282136a4c26c51b04f9cb55cbc1f566c41924 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/202bfd71-0044-4c6e-aacc-53aa44339335.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be57e96505fdc252539be42465036c99360545c7b233195546447b1cf6c7bd14 +size 140339 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/22122a92-811e-4040-8292-983413f47c8b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/22122a92-811e-4040-8292-983413f47c8b.lance new file mode 100644 index 0000000000000000000000000000000000000000..454e7b02e568d5ec5347a87acecfe56992dd8cb8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/22122a92-811e-4040-8292-983413f47c8b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5972bad5f53abf8578153e067441c20144b8f0bd1f651d02636e551533b180 +size 147054 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/22230386-cc91-468b-bda6-bf602565a4f7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/22230386-cc91-468b-bda6-bf602565a4f7.lance new file mode 100644 index 0000000000000000000000000000000000000000..750e775bb62d4890a66d42a8e2285f5a393106cf --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/22230386-cc91-468b-bda6-bf602565a4f7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d233966358ce00afd680be0902cf5067bde98c2a3472a5ee6282c08fa903ad6e +size 141132 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2299098c-b350-400a-a20c-019e1a626f70.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/2299098c-b350-400a-a20c-019e1a626f70.lance new file mode 100644 index 0000000000000000000000000000000000000000..02a7fa0c4acd649607dd3dd289526db509f351ed --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2299098c-b350-400a-a20c-019e1a626f70.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e49b318ff95d2b0e38e22b2cf8e26a22ea49ce79d607231fcfb23adc44ee66b8 +size 134229 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/23d75175-3040-4c96-8ea6-5fd5e8bb5502.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/23d75175-3040-4c96-8ea6-5fd5e8bb5502.lance new file mode 100644 index 0000000000000000000000000000000000000000..857e9e3a3e022d7132c1a2f84ce6448ce2b60404 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/23d75175-3040-4c96-8ea6-5fd5e8bb5502.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc0c43404e7d9342208f1f6916e37133a00c3d0376cc02026f44f20e1978c270 +size 138590 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/24506697-68ba-4196-ab69-58e7d2fb89c2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/24506697-68ba-4196-ab69-58e7d2fb89c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..fcc50cda5e0b0e1b4facbe74a953927a96cbf6a5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/24506697-68ba-4196-ab69-58e7d2fb89c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51818bccc3921d38c4085d5d4c2fda53a504f3ca8a7baec53d53c9978147575c +size 142812 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/24bee4cc-5358-4f50-a757-5edbd0f7e1bb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/24bee4cc-5358-4f50-a757-5edbd0f7e1bb.lance new file mode 100644 index 0000000000000000000000000000000000000000..ffe0015a501c52308eef43039ac2982d3aa22d51 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/24bee4cc-5358-4f50-a757-5edbd0f7e1bb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:014970e1766d41fcd140fae99f55b986b2c94a9c34ba27be6c6b68a4e63c03b0 +size 142033 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/24c25fa1-b75a-4dac-a111-85da342a23f6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/24c25fa1-b75a-4dac-a111-85da342a23f6.lance new file mode 100644 index 0000000000000000000000000000000000000000..c6c888de65cb373c2deba7839af14625300cd270 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/24c25fa1-b75a-4dac-a111-85da342a23f6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e75300e590f5aca1957f554471401e665e0aba1c6697dd2f85f7a18a9cbea04 +size 142438 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/24fbf3b0-2972-4185-9aed-4dffdf87b03b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/24fbf3b0-2972-4185-9aed-4dffdf87b03b.lance new file mode 100644 index 0000000000000000000000000000000000000000..7879009d13bf38c13af8c25e5718a3562196b253 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/24fbf3b0-2972-4185-9aed-4dffdf87b03b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a732bfef9e4eb53a951eae1ceaf828d11938320c94ea3d0e1ea897efbe00c35e +size 135461 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/25558e8c-333f-462b-b703-581638ba3d46.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/25558e8c-333f-462b-b703-581638ba3d46.lance new file mode 100644 index 
0000000000000000000000000000000000000000..7b3285788e02f810d8d53e7bae529c05c4ac497a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/25558e8c-333f-462b-b703-581638ba3d46.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b47370684011bdd9cf1b75951203441833cbd3babfd483cb4766589384c2cbf +size 143317 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/26221433-d4b3-4e19-aa81-4c3c5a0475d9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/26221433-d4b3-4e19-aa81-4c3c5a0475d9.lance new file mode 100644 index 0000000000000000000000000000000000000000..a91b057429a63094ca1bda1b0721ac75997d9997 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/26221433-d4b3-4e19-aa81-4c3c5a0475d9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b628c536eba503b38ac69e1f51090671b2cb7d0734fd8b195a9ea19685ede3dc +size 139820 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/273fcdac-0e1e-4078-a948-36f90baa54f5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/273fcdac-0e1e-4078-a948-36f90baa54f5.lance new file mode 100644 index 0000000000000000000000000000000000000000..142d64a0950c325983e8cd83b1f9f5466a3bbd58 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/273fcdac-0e1e-4078-a948-36f90baa54f5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27fdd8b42b2a3276763dc2899b22515b6ec10b602a1d7030cd58f3992c73e05d +size 141861 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/27841072-ec8b-4a34-bd88-8c517492f564.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/27841072-ec8b-4a34-bd88-8c517492f564.lance new file mode 100644 index 0000000000000000000000000000000000000000..789f6b61788e20b35b670e4b24e0ef0124f87472 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/27841072-ec8b-4a34-bd88-8c517492f564.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2edbc945367699f50271a2d1ee1fd4c1baf553accca07aa8231e384d6042658b +size 141854 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/27ed24f4-3776-4b75-b2a3-489958a09cdb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/27ed24f4-3776-4b75-b2a3-489958a09cdb.lance new file mode 100644 index 0000000000000000000000000000000000000000..b486ca38d76957440fa1c50c34a7ef9534021947 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/27ed24f4-3776-4b75-b2a3-489958a09cdb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07e228dbeaf61a942b4b9fc2ba52c6e5fb31e69cd3ce97113324309a766612cc +size 139825 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/287656f9-69d1-4c3a-b496-41a921c7bf08.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/287656f9-69d1-4c3a-b496-41a921c7bf08.lance new file mode 100644 index 0000000000000000000000000000000000000000..88dac581a69c13ec0f337615f4b70689aa78de94 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/287656f9-69d1-4c3a-b496-41a921c7bf08.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:607bfea77871cc123ccba6a0a13675670cf66fab3723fa7a4cbe099a1d401dbe +size 140409 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/28959aec-a061-46e1-883e-4fdfbb236c9c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/28959aec-a061-46e1-883e-4fdfbb236c9c.lance new file mode 100644 index 0000000000000000000000000000000000000000..e3c1c6796078b50deacefe22727368a6e1b6e662 --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/28959aec-a061-46e1-883e-4fdfbb236c9c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a4d030647f2191aeed68f3e58d0fb06c3d9808d0f42b39ec975cb2c485b502 +size 144185 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/294e426f-855b-48aa-b6fb-be7460d34297.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/294e426f-855b-48aa-b6fb-be7460d34297.lance new file mode 100644 index 0000000000000000000000000000000000000000..140eabc29b697d881ca38746c4c7c9920d9ad734 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/294e426f-855b-48aa-b6fb-be7460d34297.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b59574783cbd565e69a98a68b1b7523029a5d6b1c37f7f75362cf8f5bbe05617 +size 138263 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2c1614e5-2e9a-408a-8206-8874683bcf56.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/2c1614e5-2e9a-408a-8206-8874683bcf56.lance new file mode 100644 index 0000000000000000000000000000000000000000..a1971ab2eed915acc8cf2efde55a79892a9303e9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2c1614e5-2e9a-408a-8206-8874683bcf56.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a56afbdc5e21d2eb798041c15471a6004d4abe0f2096d2dda40d6e792732efbb +size 136094 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2e30a605-b3d0-4c00-b724-8cfac96aa169.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/2e30a605-b3d0-4c00-b724-8cfac96aa169.lance new file mode 100644 index 0000000000000000000000000000000000000000..ff5a7fe7e2b0301141380f9f1f1cd682484e6e61 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2e30a605-b3d0-4c00-b724-8cfac96aa169.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eefcef30a24b11888171f0cee4ea9ff8f401cb2bd708ba33805ce84bb33e7389 +size 140561 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2e36c618-db2c-444d-abe7-ec5096d6ebfb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/2e36c618-db2c-444d-abe7-ec5096d6ebfb.lance new file mode 100644 index 0000000000000000000000000000000000000000..69834bbf0816e9a77ac18d24be2b0fb5e7a8b236 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2e36c618-db2c-444d-abe7-ec5096d6ebfb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f789019b05e023ee3055e07c38944d9c304147ccff4aa56c7adec5a59e25ac20 +size 134512 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2e5b55d7-5bfc-4a40-8c72-d86243f65eb3.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/2e5b55d7-5bfc-4a40-8c72-d86243f65eb3.lance new file mode 100644 index 0000000000000000000000000000000000000000..beb438dd4275504bad64dfc6e1c505e8a248e71d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2e5b55d7-5bfc-4a40-8c72-d86243f65eb3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac9754e10600c9d86cd3c44d20131aaa23a1e75b90f03581e28b62481458a852 +size 144435 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/2ed849d8-c27a-4a52-bddd-1ed668bf0b65.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/2ed849d8-c27a-4a52-bddd-1ed668bf0b65.lance new file mode 100644 index 0000000000000000000000000000000000000000..82610eac2e5a98c1fa6f7c9c871b212a627dd874 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/2ed849d8-c27a-4a52-bddd-1ed668bf0b65.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a617e255a54fb69bf0cdf74b116459aad3d6aa202c7e410c0020a9807a2a0869 +size 138761 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3304ca1e-2044-4073-a324-d2399a632c79.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3304ca1e-2044-4073-a324-d2399a632c79.lance new file mode 100644 index 0000000000000000000000000000000000000000..4663326aadb3514fb095a69f743c12b2d76991ca --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3304ca1e-2044-4073-a324-d2399a632c79.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d5ed223a33eb1f838a20df28384aece2d9fea8d129db78245f79d52e9a29057 +size 144202 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/334cec47-930e-4ca7-b757-b2ae9882844e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/334cec47-930e-4ca7-b757-b2ae9882844e.lance new file mode 100644 index 0000000000000000000000000000000000000000..c759ad128ad759f9efdcee34b2ec4ffdd10678f2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/334cec47-930e-4ca7-b757-b2ae9882844e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5819ece5cbe9a8f708d1cb5f1a2cb50634ddbd7ea750878a26e30bd86364bb8e +size 139516 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/34887de6-1679-4ed2-8c9e-3b7011cd76f8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/34887de6-1679-4ed2-8c9e-3b7011cd76f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..785cdd588d8e0a789c7c115cd4d77c620879cc55 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/34887de6-1679-4ed2-8c9e-3b7011cd76f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6a80068fe11eeaee6802fa54eb2e65f13f916d27fedb01e7b8a0e78ef700ec +size 139559 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/360b10e6-27bc-494b-acb6-8cb734188574.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/360b10e6-27bc-494b-acb6-8cb734188574.lance new file mode 100644 index 0000000000000000000000000000000000000000..041f253d8ab747d08489c534619b8bf990acd779 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/360b10e6-27bc-494b-acb6-8cb734188574.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0e597d6e08a22c81e25b4dbd33304c74244f7e2be560525fea65874e6e25d7c +size 144331 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3700d29f-4da4-455a-9492-7da1c4927fa1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3700d29f-4da4-455a-9492-7da1c4927fa1.lance new file mode 100644 index 0000000000000000000000000000000000000000..da714cb5fbb0ed48dd9a0b6cc122feb28e91749a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3700d29f-4da4-455a-9492-7da1c4927fa1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c321de7b15b3d46a544cbdf109ba5d7888537a0ae72c5d30d6bbada3b2496131 +size 145319 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/381d0e47-7fda-46e6-9f55-b9d3600d8114.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/381d0e47-7fda-46e6-9f55-b9d3600d8114.lance new file mode 100644 index 0000000000000000000000000000000000000000..cb5cb4c79e7ff14cdf31b70a90fcdb8f63d62307 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/381d0e47-7fda-46e6-9f55-b9d3600d8114.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c69c9a4fa9cb8a3cafc081204d5a9fafe0e7253ad3fd30ad153fe1ee35760bb +size 141041 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/397200e7-64d1-4f54-ba7c-9f9e1c19fd39.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/397200e7-64d1-4f54-ba7c-9f9e1c19fd39.lance new file mode 100644 index 0000000000000000000000000000000000000000..5d09c85909f04bfb70769948950c1042d0075fc6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/397200e7-64d1-4f54-ba7c-9f9e1c19fd39.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdfda9db3b7f74d807c12fdc652d8d0a5409562bb62b22af0007f6f8f9da5ddc +size 141884 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3b585cc2-d28b-4b49-a8de-95af881819eb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3b585cc2-d28b-4b49-a8de-95af881819eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..956d77266e14f6298eba38851337dfe430c2a7f6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3b585cc2-d28b-4b49-a8de-95af881819eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95a0c066a79970495843366ddb216a540e4dd9001c806ba84fb91e98845b690b +size 147376 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3b98e792-612d-42b5-bfe3-c942a36a5f87.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3b98e792-612d-42b5-bfe3-c942a36a5f87.lance new file mode 100644 index 0000000000000000000000000000000000000000..32042f8c8c637cf858b764d6cc81d14967ec2cfb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3b98e792-612d-42b5-bfe3-c942a36a5f87.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3c6ab9b3c64c080e34142dbbb83bd57d941217948b9227032995c7062558c1b +size 140969 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3c0759e1-7eb5-46c7-b355-ecf20c57874d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3c0759e1-7eb5-46c7-b355-ecf20c57874d.lance new file mode 100644 index 0000000000000000000000000000000000000000..0baefe4a09cce8239012ba77c26852d85acda898 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3c0759e1-7eb5-46c7-b355-ecf20c57874d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e88c07c012df1ce9b642a651eec654368b15e38b8893e065acd8975643541ef +size 139758 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3d2e46ac-a7cb-4749-b1d8-24aa6a3505d6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3d2e46ac-a7cb-4749-b1d8-24aa6a3505d6.lance new file mode 100644 index 0000000000000000000000000000000000000000..08c46ebb7edc59cc1712f10c14b4913df3e8d862 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3d2e46ac-a7cb-4749-b1d8-24aa6a3505d6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f15643187f05d4d17f2d46df53de6f8c64e6d40d3bd00ca1c9b5b91b1bf871 +size 147689 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3e089f81-69db-441c-a691-8140e59b2ae1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3e089f81-69db-441c-a691-8140e59b2ae1.lance new file mode 100644 index 0000000000000000000000000000000000000000..45f4da852e50ac124daf78f35e9af1991c15597a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3e089f81-69db-441c-a691-8140e59b2ae1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64f6939a3e51349c5982189bb75a3165b92d399ca94c7a763bb8c02b46bef50 +size 137852 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3f101d2d-a4c4-4fcb-a627-6f2f2c63f8cb.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/3f101d2d-a4c4-4fcb-a627-6f2f2c63f8cb.lance new file mode 100644 index 0000000000000000000000000000000000000000..1b8e47d37d882864f7d3d825dcf94172e23abe19 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3f101d2d-a4c4-4fcb-a627-6f2f2c63f8cb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcae34a320e6e8103c028f0ad4ce6c4daa6c9136f18b2b4a3e7ab1a92be5f4c +size 139760 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/3f40eeb4-6cd0-4a98-8ddb-cedcaee8e761.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/3f40eeb4-6cd0-4a98-8ddb-cedcaee8e761.lance new file mode 100644 index 0000000000000000000000000000000000000000..faff982d4f502d86c1cd22db53a04063f8816fcb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/3f40eeb4-6cd0-4a98-8ddb-cedcaee8e761.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adae3f2f023ea0dc0a2756e6f183c02dd3846c5190d06245196a4c0498ccee4d +size 140102 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4009d7ed-0616-43af-a3de-b9d3606ecaa8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4009d7ed-0616-43af-a3de-b9d3606ecaa8.lance new file mode 100644 index 0000000000000000000000000000000000000000..7c28b81bad2636861d60be071c13daec1a89f362 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4009d7ed-0616-43af-a3de-b9d3606ecaa8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2642d44c89e2ff8f080af46498c1bd7c2c9acb3ab9569d0debcea00de244918e +size 141060 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/40a87785-7560-4d52-8d0e-0399a5ab7e5a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/40a87785-7560-4d52-8d0e-0399a5ab7e5a.lance new file mode 100644 index 0000000000000000000000000000000000000000..d65797d082037fc341e75b0d2a136c2cef18e9e0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/40a87785-7560-4d52-8d0e-0399a5ab7e5a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a918079b46402678aa3b42599956c3d6f45290ebf10832748a09a8c93c980b +size 138341 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/415fda86-b393-4131-9102-39dc12d87db1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/415fda86-b393-4131-9102-39dc12d87db1.lance new file mode 100644 index 0000000000000000000000000000000000000000..279cdbe0bfb8b37047529503d297ca7b6ae23fcb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/415fda86-b393-4131-9102-39dc12d87db1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd2c686cf12d21044f39f65214f86dd7101d03059f6eff088c4dd3263e51ae2 +size 140584 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4165c826-a5a1-40e3-b725-b509b0a44912.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4165c826-a5a1-40e3-b725-b509b0a44912.lance new file mode 100644 index 0000000000000000000000000000000000000000..650283a961c12ac5fceb1f498aea22caacc480f1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4165c826-a5a1-40e3-b725-b509b0a44912.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49a128a99a990cbacd33f4e5ee92f003a0839551bde3f9a549d9ec7d7be4ab91 +size 143778 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/41cc4d91-f43a-4635-917f-ffb1d4b9edf6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/41cc4d91-f43a-4635-917f-ffb1d4b9edf6.lance new file mode 100644 index 
0000000000000000000000000000000000000000..bfa3608b127595c924221c4e6209bafa58a7e299 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/41cc4d91-f43a-4635-917f-ffb1d4b9edf6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91ada2957f9e452efa8c72930c69fd3a0804a3f132ed750d572baf3890f6c269 +size 150044 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/41e404ea-8b90-4856-ae8a-0249e21313c1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/41e404ea-8b90-4856-ae8a-0249e21313c1.lance new file mode 100644 index 0000000000000000000000000000000000000000..c66a408e283c93d32a0c61bb90cefdb069e8e82e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/41e404ea-8b90-4856-ae8a-0249e21313c1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a249f4e9036021f6d4f56c3c5393c935a04c64608a9da6c2f067b4e65507ec53 +size 137873 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4298a324-de41-4014-b7dc-ec4c8bb536de.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4298a324-de41-4014-b7dc-ec4c8bb536de.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b8914fd8464c0435d4d0210273aa70c8b0d5cda --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4298a324-de41-4014-b7dc-ec4c8bb536de.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:878f0e9ad1e30a634e095dcc4b4bbca096114dfa607e65d288b8d01fcfad2fe5 +size 145386 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/433421c8-aeed-4667-9972-67a4d1ed4de1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/433421c8-aeed-4667-9972-67a4d1ed4de1.lance new file mode 100644 index 0000000000000000000000000000000000000000..9dcff27481498af01a28f7dc450c9200c5d55ec1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/433421c8-aeed-4667-9972-67a4d1ed4de1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3f92430780edb29179464ee83aa408da51e8ae8e86f0e55e8811239805456be +size 143353 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4451f788-f017-44ac-8919-57a14b7c6341.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4451f788-f017-44ac-8919-57a14b7c6341.lance new file mode 100644 index 0000000000000000000000000000000000000000..c15b804a106ea698956c010bf36420201a8e7491 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4451f788-f017-44ac-8919-57a14b7c6341.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2204e902eac3d261bc225fae32ec629245653fc0ff05bee50ac1cedcce56d185 +size 136737 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/447eb349-953e-4efe-81b2-24c4c565df48.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/447eb349-953e-4efe-81b2-24c4c565df48.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc9652e7ea7a8d4fe8cba8fbbcef4149f8d7d8b8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/447eb349-953e-4efe-81b2-24c4c565df48.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b57081d602b62569b81b3e4a6678af7ffcda31e28d680a88ccd41fd35d5a11e +size 139218 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/450bd8cf-c7ed-4548-b771-800a14218c51.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/450bd8cf-c7ed-4548-b771-800a14218c51.lance new file mode 100644 index 0000000000000000000000000000000000000000..9c7d7886b6f2f7c5ccf0b1e0d31e324b5523d62d --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/450bd8cf-c7ed-4548-b771-800a14218c51.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83f8e84a1159b3daba6b9ca45e5e97a12e5e26e794febaaae876a5cd1340d999 +size 136935 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/46405ddf-9939-4a8f-8b0d-99e522fd7086.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/46405ddf-9939-4a8f-8b0d-99e522fd7086.lance new file mode 100644 index 0000000000000000000000000000000000000000..4aa77246f0f03be9eda1b7bac6807a253221efb0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/46405ddf-9939-4a8f-8b0d-99e522fd7086.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:409e348378c270d8d0e41474f21d77d48a38031f83a7961644380e1ddf401f14 +size 142020 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/46943ecb-57f1-472d-a949-1ae19408b976.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/46943ecb-57f1-472d-a949-1ae19408b976.lance new file mode 100644 index 0000000000000000000000000000000000000000..c97978dc28d0ccd2b4e2d0ebe9e66e1d10637430 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/46943ecb-57f1-472d-a949-1ae19408b976.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519ecc6761d360f7c575807addb73aebab8a372b7c761a5bb2fc3784138e92b7 +size 149758 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/479502ce-b4a9-4a61-a387-0550f1375d39.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/479502ce-b4a9-4a61-a387-0550f1375d39.lance new file mode 100644 index 0000000000000000000000000000000000000000..854442d01b9688a76e4e718360f543990ebff086 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/479502ce-b4a9-4a61-a387-0550f1375d39.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d828f952c5cc1d70be43aabc6edc3ea7ac76ace64ed004aa693194a677bf829 +size 139006 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/47e67cfd-c9cb-4a03-a904-f1aff57d182e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/47e67cfd-c9cb-4a03-a904-f1aff57d182e.lance new file mode 100644 index 0000000000000000000000000000000000000000..f84de2e4dd4ac603cf395e91cd891cf457a83e15 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/47e67cfd-c9cb-4a03-a904-f1aff57d182e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e6edf05e6aa8d53d90701556b5aa133ad3dacedf89b866d418ec1b801a16aeb +size 137627 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/495765ad-8d1b-4d8e-a82e-d7c9950c8110.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/495765ad-8d1b-4d8e-a82e-d7c9950c8110.lance new file mode 100644 index 0000000000000000000000000000000000000000..06ed634af2fec652f474f39f1fcaa7d9fbf0d226 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/495765ad-8d1b-4d8e-a82e-d7c9950c8110.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aa99ae5ffdb5a4a2aeb4c0b4155f7302c2b03a7037c7afa8c56f50f1e3f3b1a +size 136200 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/49a1635f-3f32-4562-920a-37a71c549ea6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/49a1635f-3f32-4562-920a-37a71c549ea6.lance new file mode 100644 index 0000000000000000000000000000000000000000..9978729c39db4b5ffdbf46441151732039dd78bd --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/49a1635f-3f32-4562-920a-37a71c549ea6.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:14f4621702ee79f704db854019ab59304cc9d58ecb979489f382e8e2d037089a +size 142533 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/49fdf622-bef7-4a31-b4bd-642ccd69fb67.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/49fdf622-bef7-4a31-b4bd-642ccd69fb67.lance new file mode 100644 index 0000000000000000000000000000000000000000..017ddb7081c2729f9f46dc9fb2cc9ee06333f973 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/49fdf622-bef7-4a31-b4bd-642ccd69fb67.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e94ca56872a656472650f6b60ae24eb67ff80cf7b5735eb006d1e14e43dcae1a +size 137435 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4b2557db-05c2-4b32-9d36-0f1614d70fb6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4b2557db-05c2-4b32-9d36-0f1614d70fb6.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e872f14c02404afaab259cc8f00db4f62bceff8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4b2557db-05c2-4b32-9d36-0f1614d70fb6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18f9d564cf97f2de27f5932d4900f864a0519c02d7134ae8e9f0ed8c03b9888b +size 138265 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4c216a74-0eaf-481b-906d-1f5940847873.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4c216a74-0eaf-481b-906d-1f5940847873.lance new file mode 100644 index 0000000000000000000000000000000000000000..8bf8148460cae902c427b9145d76be3a44b54c00 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4c216a74-0eaf-481b-906d-1f5940847873.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d57af00e0199f7c6dac24b077a77ab64b41df1c336a561fa45d1f5d96cab5ff +size 149027 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4c46b1dc-ae07-4233-a45c-70fb5425e73d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4c46b1dc-ae07-4233-a45c-70fb5425e73d.lance new file mode 100644 index 0000000000000000000000000000000000000000..8ecc7ae9ab72e2e1c8a7cebe8aae24939d9bb378 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4c46b1dc-ae07-4233-a45c-70fb5425e73d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dbf9579a326abb8dddddf709822d63a83672858e900ab085a311079446b71d9 +size 136389 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4c7a6fa5-94b7-4227-9735-67f8cbc33ca7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4c7a6fa5-94b7-4227-9735-67f8cbc33ca7.lance new file mode 100644 index 0000000000000000000000000000000000000000..a559a763de7f41a8de6efd0dcc72cd791e6d9865 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4c7a6fa5-94b7-4227-9735-67f8cbc33ca7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78bb8be5515744ae2df91bfa0137fa41a0bfa9f05bd4e4839735d0cf7f4fd77 +size 141391 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4c979d3f-75c4-47fb-bdd5-a7be0276e80c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4c979d3f-75c4-47fb-bdd5-a7be0276e80c.lance new file mode 100644 index 0000000000000000000000000000000000000000..38460c848932b13a0b94fdcbdcb16198cad096e5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4c979d3f-75c4-47fb-bdd5-a7be0276e80c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1679ba97a249b46c08997b7b26a8a8d953f14ef17a9b214b619f8e148b6c76e +size 137624 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/4d2c92e7-9522-4e61-a2f7-416c48e7deeb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4d2c92e7-9522-4e61-a2f7-416c48e7deeb.lance new file mode 100644 index 0000000000000000000000000000000000000000..9c854a750d20ff1b172d8e25b3758a9a8b562098 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4d2c92e7-9522-4e61-a2f7-416c48e7deeb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3040a068ed87ea8df0a263747c2d6685eadbf27d22228ced5aa916a2d5d18805 +size 140118 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4d977d5a-7896-48ed-b33b-9037ea5b9ec9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4d977d5a-7896-48ed-b33b-9037ea5b9ec9.lance new file mode 100644 index 0000000000000000000000000000000000000000..b309b09a9e6637373cb0f1d7a4902508e1000de8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4d977d5a-7896-48ed-b33b-9037ea5b9ec9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c87db141786482978932fe0bafa56dcabccc41f261a0e51b748f8367b469089 +size 142460 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4dca3325-b81b-470d-b16e-f959cbd39785.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4dca3325-b81b-470d-b16e-f959cbd39785.lance new file mode 100644 index 0000000000000000000000000000000000000000..0699b49517ba7bc03e1d26501600be07aa3542b7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4dca3325-b81b-470d-b16e-f959cbd39785.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0152148bd39c591fff3bbf8c4023683abfddd51c80fc8ed495c3b519b27a6a0e +size 140217 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4de9d96f-0809-4b5a-af54-64e53f695fb0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4de9d96f-0809-4b5a-af54-64e53f695fb0.lance new file mode 100644 index 0000000000000000000000000000000000000000..86fdd315d3987a0291d80dad548974d6c847979c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4de9d96f-0809-4b5a-af54-64e53f695fb0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:821fcd314ca70e30a23740095e3159d8f3bd5596797ae728390a4e66fa4c3619 +size 142555 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4ee21e17-f630-4364-8290-daa395f83851.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4ee21e17-f630-4364-8290-daa395f83851.lance new file mode 100644 index 0000000000000000000000000000000000000000..7b276f41ef6f05f54f8b135f8addce971fb634e9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4ee21e17-f630-4364-8290-daa395f83851.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93beae2645a60c456f6b2efe2ad78361954d6f86e8bb90a3f2da413b03d4fed9 +size 142154 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4f98c120-bff4-46d3-a0bd-dedc9ac3420a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/4f98c120-bff4-46d3-a0bd-dedc9ac3420a.lance new file mode 100644 index 0000000000000000000000000000000000000000..10ceb8836e16ef3e73a82075994bd32eac59fcc5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4f98c120-bff4-46d3-a0bd-dedc9ac3420a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c69345b9496b3bd7060077b53dedd08f9e8d7fc02fa3907ec921ee87ddac9d0 +size 137905 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/4ffcf0f2-7c6d-42fd-8fb5-49eaaa3500a8.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/4ffcf0f2-7c6d-42fd-8fb5-49eaaa3500a8.lance new file mode 100644 index 0000000000000000000000000000000000000000..c5c1efb5f2cb5bc5ea98a3645257919823a0bfa2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/4ffcf0f2-7c6d-42fd-8fb5-49eaaa3500a8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52681333e7854fce94979e5e110e3ea699362862d219a81e2d52ebebb601a72d +size 134790 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5020d7a4-ffe4-4e40-8acd-a60d35c88256.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5020d7a4-ffe4-4e40-8acd-a60d35c88256.lance new file mode 100644 index 0000000000000000000000000000000000000000..332e4967d791b15de04f96e9cf4de2e144b11a8b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5020d7a4-ffe4-4e40-8acd-a60d35c88256.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d2a9dae477133ac45b7de60691216221c0b6cfc7cf84726ea3a20901b8c1d5c +size 140300 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/50b5daed-088c-4ba3-abe7-063f0f644bb8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/50b5daed-088c-4ba3-abe7-063f0f644bb8.lance new file mode 100644 index 0000000000000000000000000000000000000000..8f8936d7725ec1fabf5d0093156536224a14be43 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/50b5daed-088c-4ba3-abe7-063f0f644bb8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33bc6f5468b81973a2a40a81b61922a7e064994a449f5f5f867a964c42bbefeb +size 141876 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5101f9eb-5a4a-40e3-9e06-29361d351a15.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5101f9eb-5a4a-40e3-9e06-29361d351a15.lance new file mode 100644 index 0000000000000000000000000000000000000000..616513128c88f8c710a7b0df7981c37c424261c4 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5101f9eb-5a4a-40e3-9e06-29361d351a15.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd728ffca029e21a5429897a2bd35c99fda92f4ed611e9289b245f737a963dd6 +size 137914 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5139d76c-b826-4645-8473-299c90a28368.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5139d76c-b826-4645-8473-299c90a28368.lance new file mode 100644 index 0000000000000000000000000000000000000000..7a6590d3839647289be99a72212ba8593381582b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5139d76c-b826-4645-8473-299c90a28368.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a591d3292ddcc7225fe877e0ee8d3ff666aac1b34935271861bf79ca8a32201f +size 142868 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/51851c5a-2900-4c8f-af5b-0d55dff38c8e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/51851c5a-2900-4c8f-af5b-0d55dff38c8e.lance new file mode 100644 index 0000000000000000000000000000000000000000..179ba200d9b33bd07c327f3f61d81ef79dbceb49 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/51851c5a-2900-4c8f-af5b-0d55dff38c8e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffa88ffe3971188466a8a204cdfca5c91a0e2097e2e236bb689309f8a3ee2d9c +size 154926 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/51cf6ea3-7c0e-45ab-a0d3-e75fba343d69.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/51cf6ea3-7c0e-45ab-a0d3-e75fba343d69.lance new file mode 100644 index 
0000000000000000000000000000000000000000..8ee116a762a65267b99bd223e9db6219cf2feee7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/51cf6ea3-7c0e-45ab-a0d3-e75fba343d69.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e25b391e3d61fd0320b45134873daa3face181b1413cd38d1bced4948b4b004f +size 143045 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/51d861b9-f55c-43cb-b391-20cd12ce36b3.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/51d861b9-f55c-43cb-b391-20cd12ce36b3.lance new file mode 100644 index 0000000000000000000000000000000000000000..6c1bf2aa58ba53b56bdcf8fe3511be906f71972b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/51d861b9-f55c-43cb-b391-20cd12ce36b3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96a4f80ab5575e52831de155c60fc306742c309811e3c5f38beac26ac021e17b +size 138446 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5268a5dc-c083-48a5-84a5-3702bed084f7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5268a5dc-c083-48a5-84a5-3702bed084f7.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d6e7ef039936463839ab6e67458fce73d0cfc24 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5268a5dc-c083-48a5-84a5-3702bed084f7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d8fb132347934dcc3275a1004fcba2458d582defa5a84f2550a4ebc7cebac1e +size 143064 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/52a656df-5c1c-4bcb-9582-1007408a88e8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/52a656df-5c1c-4bcb-9582-1007408a88e8.lance new file mode 100644 index 0000000000000000000000000000000000000000..a20489cd258fe8bdb7e019c13da7ff9552ff4fa1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/52a656df-5c1c-4bcb-9582-1007408a88e8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585fd883f35596b0753f6856b0119f75016a733afa747d4284d3c14644fc10b3 +size 142178 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/53192b6d-7bd7-4798-9808-8ed6aa82717a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/53192b6d-7bd7-4798-9808-8ed6aa82717a.lance new file mode 100644 index 0000000000000000000000000000000000000000..4253dd5e3da1d9bd14560ad72d704a39e38c04e9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/53192b6d-7bd7-4798-9808-8ed6aa82717a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3434d3abdb2ddd1da535af769cb36240fac109729491bd5d902d9b460bd73625 +size 140868 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/540d0512-b355-4af6-b7ec-d9e9723fe6fe.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/540d0512-b355-4af6-b7ec-d9e9723fe6fe.lance new file mode 100644 index 0000000000000000000000000000000000000000..2e5bcf037bfbe47f133e3b9c4b599f1de7dd78bb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/540d0512-b355-4af6-b7ec-d9e9723fe6fe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59100a33444ff6ad3a082fe0ad79eb234dcbdcece8e99ab942dd88e1f0f0d94f +size 141232 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/57cdc228-9b5a-4828-869a-4bafbdfa028e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/57cdc228-9b5a-4828-869a-4bafbdfa028e.lance new file mode 100644 index 0000000000000000000000000000000000000000..e15c12ba0883c9c14b0bbf81fcc608e23b8399f5 --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/57cdc228-9b5a-4828-869a-4bafbdfa028e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da1078fc5d1473904d8bd3e1e178f64e6da50d1a173303f103417287a36a222c +size 138856 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/57d070b1-6f4e-4f8e-ba36-3580edba2330.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/57d070b1-6f4e-4f8e-ba36-3580edba2330.lance new file mode 100644 index 0000000000000000000000000000000000000000..1c0d2c92c4b7aa76134f11bb29ad9592a8b20934 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/57d070b1-6f4e-4f8e-ba36-3580edba2330.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ba5723349f4a62a68c0c1d4f6760e5c002d03e547730a04871c9cebec9cc34 +size 141702 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/586ac2b6-2123-4cd6-8b53-1ed09ceccb74.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/586ac2b6-2123-4cd6-8b53-1ed09ceccb74.lance new file mode 100644 index 0000000000000000000000000000000000000000..34870a45a54111c959f6bea84a7715baf01ddd1d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/586ac2b6-2123-4cd6-8b53-1ed09ceccb74.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb01cc01a817bce8ca033b671ff1835ce09671e4f9f11fae238da1314af6eb50 +size 138462 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/589225da-550b-4242-b375-c8716d7f5b04.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/589225da-550b-4242-b375-c8716d7f5b04.lance new file mode 100644 index 0000000000000000000000000000000000000000..cb35d975cf4b91fcc32c5da3e1a48fc25056b6ab --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/589225da-550b-4242-b375-c8716d7f5b04.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f24d4bdf402e5055773930ecb39778d8aebd5073f33581e0553ef9cc0311d36c +size 138064 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/593716e7-8014-4762-b0c3-0672e8a1f304.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/593716e7-8014-4762-b0c3-0672e8a1f304.lance new file mode 100644 index 0000000000000000000000000000000000000000..2c8eb74a630ff4ba31341c12c726c4dd80a04d71 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/593716e7-8014-4762-b0c3-0672e8a1f304.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a07230beee359d8da5d7890612016edcf444905d73b836dad7158ce2558e90ef +size 147472 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/59d0d722-24d0-496b-bee9-8c4c34e00309.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/59d0d722-24d0-496b-bee9-8c4c34e00309.lance new file mode 100644 index 0000000000000000000000000000000000000000..13d6d983de5b95257f8dae823ec0061ad727dc79 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/59d0d722-24d0-496b-bee9-8c4c34e00309.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3192cb38a0cdc11cdcfa839959fc74eae0a4c7bf16be50445e269123398bf0ef +size 138554 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5a44d98c-0f4c-4689-96ca-ca8e5df47602.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5a44d98c-0f4c-4689-96ca-ca8e5df47602.lance new file mode 100644 index 0000000000000000000000000000000000000000..844ea37357ec24b8010baf39bae22a0987b191ab --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5a44d98c-0f4c-4689-96ca-ca8e5df47602.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:5a3c9fb8c5e4c934ff63cfb51429f65ea7ba6aeaaf421c73c5405404f30189ed +size 133296 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5abbc3e6-d9c4-47b5-b791-6adcf0bcd837.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5abbc3e6-d9c4-47b5-b791-6adcf0bcd837.lance new file mode 100644 index 0000000000000000000000000000000000000000..275bd39bcd684510030519871205eda12c3ae948 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5abbc3e6-d9c4-47b5-b791-6adcf0bcd837.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:104e8c9dc186ca4d91a30c12fc483f2e47645f5a4998fef32bda7f36c32f7e55 +size 140031 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5ac006e2-2c63-44ad-8d4b-563554c30fe2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5ac006e2-2c63-44ad-8d4b-563554c30fe2.lance new file mode 100644 index 0000000000000000000000000000000000000000..f656fb16438da71111f016520c67439b72fc38ce --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5ac006e2-2c63-44ad-8d4b-563554c30fe2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8030c1fefd903b65d5d557c982b5fc9cf4cb5babc10da993d666850ee32b6290 +size 148485 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5b05b7be-b29e-44b5-b867-b62f8f92a3e2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5b05b7be-b29e-44b5-b867-b62f8f92a3e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..aa932dc2f0429d43ce1ad1a052dc0c2259791428 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5b05b7be-b29e-44b5-b867-b62f8f92a3e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:457cac622cd4760622441b219928abe0a1ecf396f2d1ed307d08f64a20f29293 +size 138310 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5b895870-ba89-4386-a0b5-c271e4b79c7b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5b895870-ba89-4386-a0b5-c271e4b79c7b.lance new file mode 100644 index 0000000000000000000000000000000000000000..62bc82895839b9190cbd4cb053ca5bb34feba2ea --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5b895870-ba89-4386-a0b5-c271e4b79c7b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2618e38d917d149343322dfe171059fd41a273bf8fd971cf63e469b4538079 +size 137449 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5c78b673-2930-4138-b5d2-86da990ba5bf.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5c78b673-2930-4138-b5d2-86da990ba5bf.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f2a142904ad0c73b493eb82e11aabbfd151ccc3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5c78b673-2930-4138-b5d2-86da990ba5bf.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad633a94353f12c6bd00ef182db5260f951f8283dd830b6097f82da099e176a2 +size 140335 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5ca997f3-2743-409a-a056-17846d469ac2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5ca997f3-2743-409a-a056-17846d469ac2.lance new file mode 100644 index 0000000000000000000000000000000000000000..7148ea688de96a686412462cbe23d3d0bd835e78 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5ca997f3-2743-409a-a056-17846d469ac2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091e4f28d28f617f13cb7760bff03f8e97f2f8c7c615531ca2224d8bac990e42 +size 145429 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/5e70eba7-e16d-49e8-a627-26b809aff481.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5e70eba7-e16d-49e8-a627-26b809aff481.lance new file mode 100644 index 0000000000000000000000000000000000000000..229511e819bd0d376b77e10a2478038be6c37224 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5e70eba7-e16d-49e8-a627-26b809aff481.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26424c86ef67d9368528b749ed0f05743257e0750d5f7aa1a715d2a36f1c1e89 +size 137661 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5e8032a0-c1cc-4c42-9055-fc81dcdfd930.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5e8032a0-c1cc-4c42-9055-fc81dcdfd930.lance new file mode 100644 index 0000000000000000000000000000000000000000..223be88c689c021f615634084ac87d3af8276507 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5e8032a0-c1cc-4c42-9055-fc81dcdfd930.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b6941ae30e3debf8b4777fa18d07c19f1cc64e0fd574b6f4ffb12a2b79c8941 +size 137971 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5f588df5-103f-46bc-8201-3a256870cfa9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5f588df5-103f-46bc-8201-3a256870cfa9.lance new file mode 100644 index 0000000000000000000000000000000000000000..9cae14d7af2ebff0d702e2f850702ab5faa7c8c6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5f588df5-103f-46bc-8201-3a256870cfa9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5105240ae2d599abb9bd4aa9b2d8b5edcb45a44fb60122d9efabcf39704f9b5 +size 136292 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/5fb767e6-c0b1-4ec8-9693-68825be55f6e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/5fb767e6-c0b1-4ec8-9693-68825be55f6e.lance new file mode 100644 index 0000000000000000000000000000000000000000..5cd6e7b1a56e2a83008a21d71653f87a96ba68ef --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/5fb767e6-c0b1-4ec8-9693-68825be55f6e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103920138be03fbc397d5e9f9e72be712d36b4536d1b4aa4278a3525ce9a3c5d +size 146475 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/606e7e52-033d-4c8a-8e5f-ede25e1c9502.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/606e7e52-033d-4c8a-8e5f-ede25e1c9502.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc5caefc61c66963b7dc6da88364b3a64c425c85 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/606e7e52-033d-4c8a-8e5f-ede25e1c9502.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7c514007195279deb573845a55c40087b72ed061f4401d97590658790f77679 +size 151634 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/622e74a3-8b19-4eca-87d4-bcef5c3808f9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/622e74a3-8b19-4eca-87d4-bcef5c3808f9.lance new file mode 100644 index 0000000000000000000000000000000000000000..71edfe5f35bba849669f61f11d0a3e1df13bb4cc --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/622e74a3-8b19-4eca-87d4-bcef5c3808f9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a6affa9e1293a3038abefb977506fac18e716eea6ae1c1a6b52a51826301c1 +size 138713 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/62592043-56ab-4234-8044-9ede9a3a8cf1.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/62592043-56ab-4234-8044-9ede9a3a8cf1.lance new file mode 100644 index 0000000000000000000000000000000000000000..c0ed77f0369b38937ad914be4d6c2c188d1eb9d3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/62592043-56ab-4234-8044-9ede9a3a8cf1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db5d305415da984b7d3812331a410faf76e669f76d61596da5ee6f1d5d71da7 +size 137961 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/62a08a4f-1e75-4dd6-8a50-6a750afc456a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/62a08a4f-1e75-4dd6-8a50-6a750afc456a.lance new file mode 100644 index 0000000000000000000000000000000000000000..e71664dca6ae3ee3e5f8c5b68d5194f76f525ef9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/62a08a4f-1e75-4dd6-8a50-6a750afc456a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6527218bf30b19c8b78a37d651197663f32057fa0910b46b63ad17655b1a17e +size 147798 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/63dbc6e7-15a9-4545-88c0-8252d42362f3.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/63dbc6e7-15a9-4545-88c0-8252d42362f3.lance new file mode 100644 index 0000000000000000000000000000000000000000..bb96a151806dda237fa70b703dff8e974b10dda7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/63dbc6e7-15a9-4545-88c0-8252d42362f3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37955bf6b29e221b9933faa23ff5c486d062db12a5b1fb4fdf1f806ef37b877f +size 139065 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/647255f9-f822-4ca4-8ad7-945b7c54f6d2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/647255f9-f822-4ca4-8ad7-945b7c54f6d2.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2ab819d1d7a33adcfa0561707f7b21e84414d50 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/647255f9-f822-4ca4-8ad7-945b7c54f6d2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5180a06bd4ec70b45d38470e1d65e9c9ebdd46264a89dc269e9720d2e57e0d +size 147519 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/647611db-8279-406e-85a5-641a238a63ea.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/647611db-8279-406e-85a5-641a238a63ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..b12c76fa40cdb01dd61226b6b8bf95745e8197eb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/647611db-8279-406e-85a5-641a238a63ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b14bf61b2f72372977605a7b2ef955c864abf2628018a6bbea370137b7c7fd +size 142153 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/64ba66c6-5ab3-4fc8-bed4-1d859ca6c492.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/64ba66c6-5ab3-4fc8-bed4-1d859ca6c492.lance new file mode 100644 index 0000000000000000000000000000000000000000..ffca4c91dd39a9eda21dcc395d4134622d10f81f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/64ba66c6-5ab3-4fc8-bed4-1d859ca6c492.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:147ba8ebda1c4450695c084e182a36b06427bd513d82b85ab99a3c0c49b59f53 +size 143996 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/66868023-bdda-44ee-88f1-8f806f2bc0d0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/66868023-bdda-44ee-88f1-8f806f2bc0d0.lance new file mode 100644 index 
0000000000000000000000000000000000000000..6fe6d22cd7be3715fa95f4537687f69aa42ba453 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/66868023-bdda-44ee-88f1-8f806f2bc0d0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:103d112a4deded9fa42786edab3d0a8b872d96e045a520027e0b52627ed1e632 +size 140757 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/66a55f79-d3e3-4d5e-8439-0bbae7a246bc.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/66a55f79-d3e3-4d5e-8439-0bbae7a246bc.lance new file mode 100644 index 0000000000000000000000000000000000000000..4bcc85c2e5c09bde0426b85ff338b49d1676e90a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/66a55f79-d3e3-4d5e-8439-0bbae7a246bc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:376b1c36238d9a7d045a6973d372b6c22816363bfda9df8a14fc34fb553f0c61 +size 135541 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/67000311-1c47-4035-955f-a4b71a1017da.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/67000311-1c47-4035-955f-a4b71a1017da.lance new file mode 100644 index 0000000000000000000000000000000000000000..39513beca83f112ae394f47b91d471e55504725a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/67000311-1c47-4035-955f-a4b71a1017da.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92a50ca4fea275e5c05332bd5abb038291d1f12465d19b84c9f3e0bdb6022f5f +size 148673 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/67f1035d-c265-4afd-ba6c-8a4f3db2c34f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/67f1035d-c265-4afd-ba6c-8a4f3db2c34f.lance new file mode 100644 index 0000000000000000000000000000000000000000..65c4b0d3c759ed782ef76947cc57232810223980 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/67f1035d-c265-4afd-ba6c-8a4f3db2c34f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ff2c8599a26d5ee715f72e6dbc78e0da666e1203a47aa2b9b4d8f9ea954aa7e +size 151460 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/68619156-9243-4689-9455-a0aff7e6f776.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/68619156-9243-4689-9455-a0aff7e6f776.lance new file mode 100644 index 0000000000000000000000000000000000000000..11b586d58b4b6dbed2753f52e71daf7e692b541a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/68619156-9243-4689-9455-a0aff7e6f776.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcbff6d9c341c00fa5094d500ea08330aebd51c4b2c36c2b1abfc4abfc5eb938 +size 138595 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/68ba2665-aea6-44fa-b138-6884a0d73b74.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/68ba2665-aea6-44fa-b138-6884a0d73b74.lance new file mode 100644 index 0000000000000000000000000000000000000000..89e88d9d790043cdfc02c714f8266dd796cfe703 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/68ba2665-aea6-44fa-b138-6884a0d73b74.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d6d160e89ccdeef95490991221fcd23472e33e14609018a0f63bdedb0159ef7 +size 139423 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6903a240-7269-48aa-8fa7-688e8f3da99b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6903a240-7269-48aa-8fa7-688e8f3da99b.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b7b40e67770ac511e3a2cee0a0b07d5fb310ade --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/6903a240-7269-48aa-8fa7-688e8f3da99b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be41922116469903cc6cbe8d91b4ad71ace16177f26009baa400a4ce326a930e +size 140653 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6bab259f-8818-4aca-8689-8448ec34042d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6bab259f-8818-4aca-8689-8448ec34042d.lance new file mode 100644 index 0000000000000000000000000000000000000000..d56e41dba92b1ab12170d516ea26edaff31828a4 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6bab259f-8818-4aca-8689-8448ec34042d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bc383ea765beab5b044f85b0695242e9d46b78cfc2cb611f036ce5f53ab55e7 +size 144239 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6c4ce6f8-2464-49f2-aaa6-80fc10f3bd64.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6c4ce6f8-2464-49f2-aaa6-80fc10f3bd64.lance new file mode 100644 index 0000000000000000000000000000000000000000..0cf58c9e797b3a097597d3f41682ae982f6fbad9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6c4ce6f8-2464-49f2-aaa6-80fc10f3bd64.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83dbe0885748fd2d832742a512f8c1a910c016f3623b799401ccb360786eb452 +size 137904 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6c8823cd-cfe8-4478-a03f-9cde86c2037f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6c8823cd-cfe8-4478-a03f-9cde86c2037f.lance new file mode 100644 index 0000000000000000000000000000000000000000..c9b2571360923419ddf968268ec2f88a348bfb03 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6c8823cd-cfe8-4478-a03f-9cde86c2037f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ce55f53c7faed4c63b29710cd988fb74e852f8ad4ecd4c3fdd8c8b70877dcfc +size 139070 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6caf922e-277c-4b5c-99dd-f1d28f999e9d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6caf922e-277c-4b5c-99dd-f1d28f999e9d.lance new file mode 100644 index 0000000000000000000000000000000000000000..8afa9a9a28b03250044a08b57c22fc205c4e87eb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6caf922e-277c-4b5c-99dd-f1d28f999e9d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a61a05146fa538b0c6357faf393a472c05d41e1c5da8974cc5a1a3e1db1d7a02 +size 138737 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6cb69446-9618-49a5-95ad-e8c2e40e8e84.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6cb69446-9618-49a5-95ad-e8c2e40e8e84.lance new file mode 100644 index 0000000000000000000000000000000000000000..1bc4364416835437a2c945476f41a6bece5c8234 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6cb69446-9618-49a5-95ad-e8c2e40e8e84.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7404f5ef5b7443ca7d4d70c2afa706f83c658e36e52e1ec5cc5e0d42e9f78dfc +size 137047 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6ce427cc-d338-4493-bc95-56a981f8a0cc.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6ce427cc-d338-4493-bc95-56a981f8a0cc.lance new file mode 100644 index 0000000000000000000000000000000000000000..bd492d5e58b7c1d9626033d058eef675a4ee86b8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6ce427cc-d338-4493-bc95-56a981f8a0cc.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:15e5fbf622468b8705c70ad2e4600b151569a4097d818193d53ea4ab9b4e3cf6 +size 140620 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6d1ce16f-81ee-409d-a91c-7b4c6bc4dddd.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6d1ce16f-81ee-409d-a91c-7b4c6bc4dddd.lance new file mode 100644 index 0000000000000000000000000000000000000000..48412afc6beed5674ba2aa2b8060967b1ce0061d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6d1ce16f-81ee-409d-a91c-7b4c6bc4dddd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3cc61b71029e0b6ba3d21489cfd2b344c39765000652f2cc4008ea03eab420 +size 140252 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6d3e1f47-cb8c-4b7c-8772-8a39052b12c0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6d3e1f47-cb8c-4b7c-8772-8a39052b12c0.lance new file mode 100644 index 0000000000000000000000000000000000000000..a6b2611b572f8c31b28c838050ff1aad3936cd48 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6d3e1f47-cb8c-4b7c-8772-8a39052b12c0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f80b4d793f786166b49157891ff1a4f805c11ff0261eacd993cbb1d01daa92cc +size 138765 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6e57b62d-618d-4797-bc16-5679d7ab245b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6e57b62d-618d-4797-bc16-5679d7ab245b.lance new file mode 100644 index 0000000000000000000000000000000000000000..71dc343ecd16ef607499636f4b1124a045a35741 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6e57b62d-618d-4797-bc16-5679d7ab245b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aaebe2397f66d8572dcac29387341d8627af4e8156ed7d072b3dbe26505a158 +size 137804 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6f45830c-b523-45ef-a300-0076d8176163.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6f45830c-b523-45ef-a300-0076d8176163.lance new file mode 100644 index 0000000000000000000000000000000000000000..71fe944528d37220c8bbf3bb31be98ecfeca98d5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6f45830c-b523-45ef-a300-0076d8176163.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80f565da8d3093d1784e2973c3b96ae2bde5c24d8dfced7f8d028420663a30fc +size 140047 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6fc97d21-ba2e-439c-8bba-d1bfd7f9c1b2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6fc97d21-ba2e-439c-8bba-d1bfd7f9c1b2.lance new file mode 100644 index 0000000000000000000000000000000000000000..37a6bd394d60d5beb872775a9526e62b7603088a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6fc97d21-ba2e-439c-8bba-d1bfd7f9c1b2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70bf9be7078324b8e47537568e8183af2b75988531c55d0d562b3ef9fdababc4 +size 138353 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/6fd73e0f-f238-42a8-bddb-4203e904b88e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6fd73e0f-f238-42a8-bddb-4203e904b88e.lance new file mode 100644 index 0000000000000000000000000000000000000000..27a63e8b68f3ee7aa0297e37817f3f8242271331 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6fd73e0f-f238-42a8-bddb-4203e904b88e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f96e67a9cdb68e1d5585f3004ed68179ab4886649b8f67c1cc8c48e29e0d9ddc +size 142777 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/6ffac0cc-11f0-4421-9a65-e1cd337d1046.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/6ffac0cc-11f0-4421-9a65-e1cd337d1046.lance new file mode 100644 index 0000000000000000000000000000000000000000..9a2e12bcf9547483a3df23b95965252a121827f7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/6ffac0cc-11f0-4421-9a65-e1cd337d1046.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:016415361339137681195f7894f8fbece4a4ca3d0472374796baa142504817e7 +size 145805 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/70e039b6-83ad-4deb-b908-57a32026c2da.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/70e039b6-83ad-4deb-b908-57a32026c2da.lance new file mode 100644 index 0000000000000000000000000000000000000000..93971f08b89c7dc936ab3da6b99357bb2f0a5adb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/70e039b6-83ad-4deb-b908-57a32026c2da.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:165d6362b17b151a0726b82dffdcfd7eaa34afa4b9c86a3185cb47446d4b36b9 +size 137138 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7158c29a-cef1-4267-8f6b-9e1a83d766ce.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7158c29a-cef1-4267-8f6b-9e1a83d766ce.lance new file mode 100644 index 0000000000000000000000000000000000000000..dcfb7a49fdd4c30ce2878730b841278b7bbbbfb0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7158c29a-cef1-4267-8f6b-9e1a83d766ce.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a395e6db9878e47cae50c00050dac58a3df8ea4d52de7af62f96fab29d21158 +size 148846 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/71afaff4-6b7c-4c7a-a460-dca66e9df694.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/71afaff4-6b7c-4c7a-a460-dca66e9df694.lance new file mode 100644 index 0000000000000000000000000000000000000000..ca4754a923eea2cf7540bbadef68d5c0ddec321c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/71afaff4-6b7c-4c7a-a460-dca66e9df694.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1751d52c31ba4862083a781f96bdfc1be4bcde292866caffed4e84cd1dc5e3 +size 145234 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/71b139bc-e2fd-4748-a451-9c8acded7681.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/71b139bc-e2fd-4748-a451-9c8acded7681.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a4383f88d9c4d2af6a9ce9ee8ac45e99ea44dec --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/71b139bc-e2fd-4748-a451-9c8acded7681.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3a8979f9e5971fcdf266b52de3793ba22ac0aff7fee4a20adf5a3298cfccdf +size 137661 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/73748294-f417-41ce-97af-7eb4253aac25.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/73748294-f417-41ce-97af-7eb4253aac25.lance new file mode 100644 index 0000000000000000000000000000000000000000..0626b44ad32c149aad03d32a407cb0d3d29b599f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/73748294-f417-41ce-97af-7eb4253aac25.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f098b13359a20247f2033151a35040a05760ff333ef48b7636bb8696aa759e59 +size 140884 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/744d4d0d-8a9d-426e-803b-64d8f8497538.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/744d4d0d-8a9d-426e-803b-64d8f8497538.lance new file mode 100644 index 0000000000000000000000000000000000000000..e9d93ad30550145d533389ebd2bd3af855ed9042 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/744d4d0d-8a9d-426e-803b-64d8f8497538.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:794b31d67ec0ff7bf81cbfcee2fb0f3525ac37f035e9d2ebc8d04572e64c9741 +size 135865 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/753ac75e-41b5-4425-9fce-9c1b6e62edb5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/753ac75e-41b5-4425-9fce-9c1b6e62edb5.lance new file mode 100644 index 0000000000000000000000000000000000000000..d30a6ec5e3853dd3e3801d2b215b0f2da59ec8f3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/753ac75e-41b5-4425-9fce-9c1b6e62edb5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96d640dae60f98ec0ee4611d0609d95be3659fb764a759f80553d795bb24218 +size 145818 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/759a1a5a-56d8-4494-991a-bc9fbc69aec6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/759a1a5a-56d8-4494-991a-bc9fbc69aec6.lance new file mode 100644 index 0000000000000000000000000000000000000000..2a8e01d6bd0600d66419b217b80ad8f077b9df12 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/759a1a5a-56d8-4494-991a-bc9fbc69aec6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45fbde4018195a0941e182bc4817748511d389feeeb4e3d23c063817ee8a6352 +size 148701 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7624c797-22ec-48ea-9978-5be180c77c5f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7624c797-22ec-48ea-9978-5be180c77c5f.lance new file mode 100644 index 0000000000000000000000000000000000000000..fe4fbe8be14e0d28a193dcdae2c389d095b9ffd8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7624c797-22ec-48ea-9978-5be180c77c5f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f8e3338e93f488de6b532c268822fd9b7bd3c965b9d241957147228acbb91db +size 139469 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/76accc1f-c757-48fe-a92a-895867c94197.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/76accc1f-c757-48fe-a92a-895867c94197.lance new file mode 100644 index 0000000000000000000000000000000000000000..c6585925d2681d5389e41bfc81dc234569be4162 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/76accc1f-c757-48fe-a92a-895867c94197.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c36b964d25175f4e2ebae05c8dfedac46294f5870a7e9647a4ea6edbf981cd74 +size 146552 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/76ca3457-4051-4a97-bec9-dee02d101ee0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/76ca3457-4051-4a97-bec9-dee02d101ee0.lance new file mode 100644 index 0000000000000000000000000000000000000000..f77f29bf84ebf2e7e9db92b092ec9ab7aef3bcd7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/76ca3457-4051-4a97-bec9-dee02d101ee0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81dd850eee90bc79dc6f21dbc624c654e7cf49f031ebffdbcac1e61a7b1e884e +size 150277 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/76ca4246-2ef4-4059-87a7-46f78a7da4cb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/76ca4246-2ef4-4059-87a7-46f78a7da4cb.lance new file mode 100644 index 
0000000000000000000000000000000000000000..3b8062bbdc90a1cd22822d9675429e227a0f0a64 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/76ca4246-2ef4-4059-87a7-46f78a7da4cb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef54b81ed0c4db7271b30d0f07692394f6424b95a5d06db41e8ea2fb89007aa1 +size 137240 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/76d104c9-1f00-4e02-b676-46465e798103.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/76d104c9-1f00-4e02-b676-46465e798103.lance new file mode 100644 index 0000000000000000000000000000000000000000..be7acca13e10cbff8a13e39b3c739ae8d02a2280 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/76d104c9-1f00-4e02-b676-46465e798103.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be185b507ed6abd7b875aa6639ff5b46aa129b302e84fb04712e3943416d728b +size 140733 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/77415be4-eef5-4da0-ab9e-9aae0becc7ea.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/77415be4-eef5-4da0-ab9e-9aae0becc7ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..b43a10835331f965a5a03a35a10368ffd3507379 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/77415be4-eef5-4da0-ab9e-9aae0becc7ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f69bc3a7eea150a518bb3a34ae114ad8a879ef722233d12d8fd19154e7d5d2a8 +size 135742 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/77b0a058-c50e-4b68-89eb-d3f993095870.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/77b0a058-c50e-4b68-89eb-d3f993095870.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b63306601cb38055abffab0aaac3f899245945c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/77b0a058-c50e-4b68-89eb-d3f993095870.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcd3fc3fb07c8e290f7e4349d0f646489f6f33f18bb490b5317f2980d6ae456 +size 142122 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/77d91a4f-3aba-4220-8863-c6d4cd0c033e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/77d91a4f-3aba-4220-8863-c6d4cd0c033e.lance new file mode 100644 index 0000000000000000000000000000000000000000..b56f0ddf168048e8b05c574ca512582bab108a24 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/77d91a4f-3aba-4220-8863-c6d4cd0c033e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:354f9040193d00763bd6ca1be601875eb102ba3395d729d61995059d7d25d491 +size 140923 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/78a07c5b-4725-4968-a4ee-bfdf0ca2cba1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/78a07c5b-4725-4968-a4ee-bfdf0ca2cba1.lance new file mode 100644 index 0000000000000000000000000000000000000000..715f700eb8ca65e6ebd079c0db095d8c79493d7f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/78a07c5b-4725-4968-a4ee-bfdf0ca2cba1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aa89ebe02b8fc52783822eaabfd308b97f08a4a3fbc033c0498901cb0824fee +size 135912 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/78e31b0f-789d-4a15-b513-d57bc94631b0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/78e31b0f-789d-4a15-b513-d57bc94631b0.lance new file mode 100644 index 0000000000000000000000000000000000000000..392dcca5ce461f4a970ad392bae902eb1b5c721e --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/78e31b0f-789d-4a15-b513-d57bc94631b0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9b39a79a20d24a24f15cdf1a04284e9fb4f9588bde013449fcc720c87164d30 +size 139918 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/78fef96f-d2aa-47c0-a0bc-0e82a65abf10.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/78fef96f-d2aa-47c0-a0bc-0e82a65abf10.lance new file mode 100644 index 0000000000000000000000000000000000000000..48bf1c11195b6dc417d37af61629ae7f63e5daa7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/78fef96f-d2aa-47c0-a0bc-0e82a65abf10.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a574ac4ca385de25545e7f16207e868e79e6bc7cd112c2ce64db3ff3a205722 +size 148841 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/793fd568-2aad-445c-971e-6c2a83873f21.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/793fd568-2aad-445c-971e-6c2a83873f21.lance new file mode 100644 index 0000000000000000000000000000000000000000..9cc8755244f5feb5d9fa56a2b75dd2be5a42bb93 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/793fd568-2aad-445c-971e-6c2a83873f21.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e1cf2ee79c4185ef6d9e5657a858ca68a4df9ddbf320220210963c6abda134 +size 141585 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/79698cc5-1d5a-48db-8745-3fb833ca3dc6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/79698cc5-1d5a-48db-8745-3fb833ca3dc6.lance new file mode 100644 index 0000000000000000000000000000000000000000..c959231f494993ce27e709d732a1d267d2d65036 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/79698cc5-1d5a-48db-8745-3fb833ca3dc6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229d92636daa6974a552ac2671704a5fd017d91721c2e336e46c9a0666bdda44 +size 134829 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/79f77438-4578-4574-ab76-7fe794895c6f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/79f77438-4578-4574-ab76-7fe794895c6f.lance new file mode 100644 index 0000000000000000000000000000000000000000..78985d8121944113c8a062825ac26d063dde80fa --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/79f77438-4578-4574-ab76-7fe794895c6f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f20b4ebb571dfe0a7ac4f24d0f64e793410f1c3ee1a571614fdcd7b5fa0cbc +size 141822 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7b4909de-c0d1-4176-9d46-2dfff11b17c6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7b4909de-c0d1-4176-9d46-2dfff11b17c6.lance new file mode 100644 index 0000000000000000000000000000000000000000..5de120791fee7aa1c3649a84e24c38c2832b5cb2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7b4909de-c0d1-4176-9d46-2dfff11b17c6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4acfce81eb997d5212d80aee3a44d0b027eb6b8ef42fcf83f83c94cae6e09eda +size 139974 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7bc1ea9a-3cf1-4779-8ad6-074e86a1c886.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7bc1ea9a-3cf1-4779-8ad6-074e86a1c886.lance new file mode 100644 index 0000000000000000000000000000000000000000..157d7c5fc9094dbc329fd6decf0723e74cfbb13e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7bc1ea9a-3cf1-4779-8ad6-074e86a1c886.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a4e77ab7c862053fc8c6575f34dc53e94385242d0d74694b49bc88d36db4d4ba +size 148718 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7bd95075-8a85-4106-a638-37679acae269.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7bd95075-8a85-4106-a638-37679acae269.lance new file mode 100644 index 0000000000000000000000000000000000000000..d3fbc2862f59c8fa457e343fcd483714cd3bf860 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7bd95075-8a85-4106-a638-37679acae269.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26dae89fb5f9d5311f2ff812ca227d6e9b7a5ea8039c7ccdf0073aa15a1b981f +size 138889 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7c7738bc-4aac-4b17-8bfa-479525dca9eb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7c7738bc-4aac-4b17-8bfa-479525dca9eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..180e8d04a535d863d061609e5ff7226d51e21a71 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7c7738bc-4aac-4b17-8bfa-479525dca9eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e4713b5a693cbdb3ea3334b6f1a5f8668554fa437b4259df5599682d875a46 +size 143527 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7c9ada86-f729-4595-81cb-ba27f1343e3f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7c9ada86-f729-4595-81cb-ba27f1343e3f.lance new file mode 100644 index 0000000000000000000000000000000000000000..5139a071b0292cfb98130b89ece6c4db4dd2d150 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7c9ada86-f729-4595-81cb-ba27f1343e3f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8aa59fdc33d9d9fa226747257e9ed2710e27426281c6690898fb74415a3341c +size 143413 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7d2f7eae-4ced-4912-ad06-bf2d321ee01c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7d2f7eae-4ced-4912-ad06-bf2d321ee01c.lance new file mode 100644 index 0000000000000000000000000000000000000000..6c8ef84fb0ad93d10c174898690662e5709d8020 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7d2f7eae-4ced-4912-ad06-bf2d321ee01c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed9283da6a5ea21b27b15d9992db55bd08de9a5a9fd406a3f3662488c09db648 +size 140092 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7d4c4a52-e930-49ab-912e-27188bc10506.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7d4c4a52-e930-49ab-912e-27188bc10506.lance new file mode 100644 index 0000000000000000000000000000000000000000..24bc9440e7aae36db76c7cdb5d1f4124a3898b59 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7d4c4a52-e930-49ab-912e-27188bc10506.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26b2aa919150f2e840e95c8cfbb848ce6331b18badc903424ea8b3343876437a +size 142644 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7d86c303-847b-41f4-bf57-e16ef2fa5d04.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7d86c303-847b-41f4-bf57-e16ef2fa5d04.lance new file mode 100644 index 0000000000000000000000000000000000000000..6f4dba121233033e2db2ac0f3b95c55823e7580f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7d86c303-847b-41f4-bf57-e16ef2fa5d04.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b9c8d1fdc158539b9b689465b8dfd9628303ec26e73e5fd843ca2ad3b95c81e +size 139920 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/7df440da-4ea1-4d78-af7a-7fa939236c73.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7df440da-4ea1-4d78-af7a-7fa939236c73.lance new file mode 100644 index 0000000000000000000000000000000000000000..8b7263bae1573246bcab2590af0a045bb07a8941 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7df440da-4ea1-4d78-af7a-7fa939236c73.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f1d25e620c8ee8e6ccc167248adf9c242e518a7b6066b2e6ec4eec852c2acae +size 137877 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7e6dcdad-063a-43c4-8b0e-9d100cb076f9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7e6dcdad-063a-43c4-8b0e-9d100cb076f9.lance new file mode 100644 index 0000000000000000000000000000000000000000..9fb87c5b700f61e5cfe9fec031a68cabbf4fb42c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7e6dcdad-063a-43c4-8b0e-9d100cb076f9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb34b09cc5803be9e8800cfad869b664c5c7e852e25b0f8aec07c82eed3dfc6 +size 144748 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7e87013a-c79b-4c1b-8b63-9ab96293f0f1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7e87013a-c79b-4c1b-8b63-9ab96293f0f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2c17b670a0afd11594d9c65688e3abe6d2f7899 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7e87013a-c79b-4c1b-8b63-9ab96293f0f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3617cba42fd8eb4d1c80cd9776ace672586571d50e719cbb3a5a903f04eab04d +size 146857 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/7f0407ed-e538-4fcc-a417-2c44365322f8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/7f0407ed-e538-4fcc-a417-2c44365322f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..3073b33f425fb8885f3adcfb6b896bd7b2426438 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/7f0407ed-e538-4fcc-a417-2c44365322f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bb8e116fddd5ccb7404e09ee4f797a09b395ae6be3968cc2c2248630b52cba8 +size 140827 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/815473d9-5e1b-4c8d-9483-206b07f8a4db.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/815473d9-5e1b-4c8d-9483-206b07f8a4db.lance new file mode 100644 index 0000000000000000000000000000000000000000..a909c7d4b729338aa129c9c1679025825976046f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/815473d9-5e1b-4c8d-9483-206b07f8a4db.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ee6a76d9d8a873929110e4182bae9e7082421cf7c6f51cb7ee5d02bc1e91a75 +size 139820 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/83e26006-5cc6-443d-9292-5baa39d3acf9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/83e26006-5cc6-443d-9292-5baa39d3acf9.lance new file mode 100644 index 0000000000000000000000000000000000000000..88cff7aa2e624c7d8ac3befb2832301a1baadb16 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/83e26006-5cc6-443d-9292-5baa39d3acf9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a072a72586465045d99d8f71913a99db8c489aced31e0464f67590812568cd1a +size 138077 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/849010c5-c085-454c-8bfa-4d011f50c1b7.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/849010c5-c085-454c-8bfa-4d011f50c1b7.lance new file mode 100644 index 0000000000000000000000000000000000000000..99419c5f1c56ddb1becd4b151d4fdbd9ba88379e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/849010c5-c085-454c-8bfa-4d011f50c1b7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec38cd2ecda8652d50f3421957ac21ccbf557e39834351b0bb7ee0b9b8ecfb5d +size 148239 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/84973801-5ac9-47ce-a060-cb4d011ae62c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/84973801-5ac9-47ce-a060-cb4d011ae62c.lance new file mode 100644 index 0000000000000000000000000000000000000000..1cad13cead5ac2f00481c8dcea0eabcb03deb9c5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/84973801-5ac9-47ce-a060-cb4d011ae62c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7b166bd87e286e2de156f7563677d0a6294bb5e6e0ee06766b7d92398a5cf40 +size 142194 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8520b645-8acb-41e2-9988-49b321d14b62.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8520b645-8acb-41e2-9988-49b321d14b62.lance new file mode 100644 index 0000000000000000000000000000000000000000..37907d98c656ae5c751ac0d5845420b0be2ad37f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8520b645-8acb-41e2-9988-49b321d14b62.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b0e0843acb0cc70c14412f7d98508263cb4df18a91d3552a25594fd1f86179a +size 138437 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/87f5ca9d-27d4-41c5-a651-1bc6fbb2da76.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/87f5ca9d-27d4-41c5-a651-1bc6fbb2da76.lance new file mode 100644 index 0000000000000000000000000000000000000000..71413906cac4d07bc791e5f25f486be2e3f0e0be --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/87f5ca9d-27d4-41c5-a651-1bc6fbb2da76.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60fc0c66e3f8705e4981986d024a6fe198d4dafc544b046a30b841cae31a6751 +size 144220 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8ab3b703-517c-4c2d-ba7f-9a4380b51909.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8ab3b703-517c-4c2d-ba7f-9a4380b51909.lance new file mode 100644 index 0000000000000000000000000000000000000000..24def6c072c55de690f3331ce826b0ce0cf54a33 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8ab3b703-517c-4c2d-ba7f-9a4380b51909.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62906d1c7070639cf6eb8c4225bd6d1d4b9584e44b108b6a132785dfb3b50d90 +size 137417 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8bfa20e6-a270-43a8-a895-bcba5f578c95.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8bfa20e6-a270-43a8-a895-bcba5f578c95.lance new file mode 100644 index 0000000000000000000000000000000000000000..27cf45182c8740560d195483f4dc354d28029d59 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8bfa20e6-a270-43a8-a895-bcba5f578c95.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a34bf7d66e3382cda476b204b645dc002242eeab3e7ced08c309b62bac79218 +size 137142 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8c14e9a0-da21-49f2-80fc-cfe63fb0eeea.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8c14e9a0-da21-49f2-80fc-cfe63fb0eeea.lance new file mode 100644 index 
0000000000000000000000000000000000000000..25a05d4d3dbd4333c0aa193beb09982df49a4232 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8c14e9a0-da21-49f2-80fc-cfe63fb0eeea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c93ba112f46adef8446ac1c95fa2ff7a03f67486a364d678a1744d099e05058 +size 141039 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8cc05c1b-6b1e-4afb-b1fc-e477725fb1c2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8cc05c1b-6b1e-4afb-b1fc-e477725fb1c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..7dfa4a6471e0d06ddbc0377006e551d630337956 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8cc05c1b-6b1e-4afb-b1fc-e477725fb1c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a13313cc20d673280e0c8f2d562148fccabc372bdd6a3571ab2449b034b9f52 +size 140250 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8dad98b9-1384-4e1c-a003-b02071ccc2f4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8dad98b9-1384-4e1c-a003-b02071ccc2f4.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3fd00dbd7e2c021a9abca87fbf57ae5fa395ba9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8dad98b9-1384-4e1c-a003-b02071ccc2f4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc48086f090269d63ce13fe16429f1b7bd2f38c0c74d0cdfe4a3bbe13cb09c1f +size 140218 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8de65e39-a2f7-45d6-b153-1c739492f409.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8de65e39-a2f7-45d6-b153-1c739492f409.lance new file mode 100644 index 0000000000000000000000000000000000000000..830c8abd4d12707df8b911f1e694a633001a1f2f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8de65e39-a2f7-45d6-b153-1c739492f409.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12858450b8fbd05fcea99869023ec44e97038f2111a3a5d910c8e3496c9a697f +size 141043 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/8fa0f14a-ba2b-4e18-9c63-c496bbb6741a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/8fa0f14a-ba2b-4e18-9c63-c496bbb6741a.lance new file mode 100644 index 0000000000000000000000000000000000000000..6b7cbbbdecfd66ef0633a3dc9007dacd5cbd138f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/8fa0f14a-ba2b-4e18-9c63-c496bbb6741a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f880314785c45107b174baab3f2b79ebdbcc9441d10d80343b35be83f643d2f +size 134683 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9052fc67-d1b9-415d-ad33-007fb61fd382.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9052fc67-d1b9-415d-ad33-007fb61fd382.lance new file mode 100644 index 0000000000000000000000000000000000000000..1c33374b4b1d131b76759f184bb57f83cf7ed143 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9052fc67-d1b9-415d-ad33-007fb61fd382.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78aadb6775cde6aa1529869fe2c10841421a8459fcc3e7e2a79ffe8f63092082 +size 140723 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/905686ed-1e96-4a46-b3ce-e79c8aac2f3e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/905686ed-1e96-4a46-b3ce-e79c8aac2f3e.lance new file mode 100644 index 0000000000000000000000000000000000000000..8ff8db8343cd9e87abead06f23d16ebd0e165b99 --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/905686ed-1e96-4a46-b3ce-e79c8aac2f3e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e9e3ddb72ad6a6c6d8bafd93886e2f6438bbd23899d75d62ab2393b4d9485a +size 145265 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/907a351b-d69f-4c7d-928b-1fb3169e0971.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/907a351b-d69f-4c7d-928b-1fb3169e0971.lance new file mode 100644 index 0000000000000000000000000000000000000000..eeacf9a78eab1e72171ba61135db4432a9450275 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/907a351b-d69f-4c7d-928b-1fb3169e0971.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1595a7c0097e45e1d040264f7f4955bd463d78730e7b90482d87ffde65470cd5 +size 138226 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/923229cb-d699-4cc5-b850-284707b96e12.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/923229cb-d699-4cc5-b850-284707b96e12.lance new file mode 100644 index 0000000000000000000000000000000000000000..d7927ff245d2bc9390699fbd01aa2f0390ea0a74 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/923229cb-d699-4cc5-b850-284707b96e12.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6d13091374282117b62e25aeea3e9caadb7b397fb6bbf58ec107effef9f0294 +size 144855 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/92afd1e9-f985-455f-86a7-d1cf7f504b01.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/92afd1e9-f985-455f-86a7-d1cf7f504b01.lance new file mode 100644 index 0000000000000000000000000000000000000000..b6a39ce5a167797df3b39319570ad1fbc59c7047 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/92afd1e9-f985-455f-86a7-d1cf7f504b01.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e82579ab25f35c9c77f44920dc478fbedbc2b9c5a205607156a298428c47b79 +size 136162 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/94762837-566c-4834-bb08-1de486e60b1b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/94762837-566c-4834-bb08-1de486e60b1b.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd6f93ebcbbf7129d262f45fdd36d61b16667069 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/94762837-566c-4834-bb08-1de486e60b1b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62125ffad2c9a5774e36506d52686fb0fe52ac63f4cb76ede3b861ad8b745841 +size 143500 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/949efbd7-e217-4e90-b7d1-b86e3646c517.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/949efbd7-e217-4e90-b7d1-b86e3646c517.lance new file mode 100644 index 0000000000000000000000000000000000000000..2622b03c3740b87705cf001c72625bcd8080c039 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/949efbd7-e217-4e90-b7d1-b86e3646c517.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8242ecf8eecc31e88e22e535c1a92d422d1f6a862154aee3515140fb2511001a +size 142085 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/960b4fc6-ee20-42e7-ac36-252827bb8592.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/960b4fc6-ee20-42e7-ac36-252827bb8592.lance new file mode 100644 index 0000000000000000000000000000000000000000..212ab337342c7d5ae9944e6992095509c3d0324c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/960b4fc6-ee20-42e7-ac36-252827bb8592.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:1d1479be69c886966032f7f26d9bf2b3e0b6ac1611a067683fc22367fd305fcc +size 135503 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/96daa5c3-873d-41f8-b850-fbf17656a314.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/96daa5c3-873d-41f8-b850-fbf17656a314.lance new file mode 100644 index 0000000000000000000000000000000000000000..1fc9da625a074c8b4266d4fe258f70ee1db900a7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/96daa5c3-873d-41f8-b850-fbf17656a314.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19410aa517734f4d1edbd0e90200b8dcfe3da2d560c172b8685a23a4ab9afa2 +size 137200 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/971d1c36-fa93-432c-81f4-92d5b0a3b819.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/971d1c36-fa93-432c-81f4-92d5b0a3b819.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2efa4c101951bd0be0bb86f98bc9cce65842592 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/971d1c36-fa93-432c-81f4-92d5b0a3b819.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0080e16488dc0e6717f53adf0f08bc24578a48a52e8a62ffab37701622de1aa1 +size 131063 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/97cb48ec-257a-4285-b27e-9c363e2d70b6.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/97cb48ec-257a-4285-b27e-9c363e2d70b6.lance new file mode 100644 index 0000000000000000000000000000000000000000..cd64bd2b7913165f1c18465d644d644c3d8ff91b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/97cb48ec-257a-4285-b27e-9c363e2d70b6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e8a11ce0c59182add7039cec7e5ab3b0a87db13dffe86728bcc9c542a3de919 +size 140147 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/986191b7-8aae-4311-a1f1-31eb12222f96.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/986191b7-8aae-4311-a1f1-31eb12222f96.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2e54eee54001d07ee06670a543180dfc01e4c94 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/986191b7-8aae-4311-a1f1-31eb12222f96.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40dc756211d8de53106a66b6fd6e7c09c8ca6ad94a8d6850e7a1cdb8a4092567 +size 146910 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/98663e34-b921-48f9-86fb-12e31ffeb49a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/98663e34-b921-48f9-86fb-12e31ffeb49a.lance new file mode 100644 index 0000000000000000000000000000000000000000..6206ab389abb82eb0f545e5ffc69a00f88f4f0d6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/98663e34-b921-48f9-86fb-12e31ffeb49a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed449fbe85a005b682ed61d8a757d716bad3bea04bdcaba7c5a9c3bf904cb895 +size 138621 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/99d6080f-54a7-4c4d-955f-051d7d855750.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/99d6080f-54a7-4c4d-955f-051d7d855750.lance new file mode 100644 index 0000000000000000000000000000000000000000..d4ca21407f2ef04940b3a9e88e76c46510c6c7b7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/99d6080f-54a7-4c4d-955f-051d7d855750.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:514b61e7f6412a8fc0f73e5abc63dee6e51a49f5dc121aff0930c79bccdad24f +size 139107 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/9a324c68-6ea1-4c7f-9fc1-a49a5cbe9c83.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9a324c68-6ea1-4c7f-9fc1-a49a5cbe9c83.lance new file mode 100644 index 0000000000000000000000000000000000000000..fff91b29b9424853be77f01ea55ff280b5113107 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9a324c68-6ea1-4c7f-9fc1-a49a5cbe9c83.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c6be77f6ee77bcc92c8412ef75f57b2a002f747f8ffa601ff1a11bb3ac364a5 +size 144608 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9c1bc5d0-6d0d-4b2b-9f01-bb14fe9d6e30.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9c1bc5d0-6d0d-4b2b-9f01-bb14fe9d6e30.lance new file mode 100644 index 0000000000000000000000000000000000000000..391cd8156bafc6f92ccab447a782f5a19723b1ad --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9c1bc5d0-6d0d-4b2b-9f01-bb14fe9d6e30.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e131f651626c8feabe7682a17a7dd424c4dfbbf1830b700aab3e1735f887a3b3 +size 150516 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9d08d5db-0bfd-4450-aba6-0c089a635590.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9d08d5db-0bfd-4450-aba6-0c089a635590.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b36efc591394fefc6ab30e7a71f8663680bdcd2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9d08d5db-0bfd-4450-aba6-0c089a635590.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341da2a96cf4c199dacd6f87cff007f464484d54bcd73662453c396136e2b7fe +size 140372 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9d11c357-87b5-450a-8d8f-97e96f4349f2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9d11c357-87b5-450a-8d8f-97e96f4349f2.lance new file mode 100644 index 0000000000000000000000000000000000000000..19772ddaef2ea1ee602371226e07ba352eaea430 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9d11c357-87b5-450a-8d8f-97e96f4349f2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:057ab5faee68a1aac5aa05a7af3a2475e446155e3a73f417d1266c062248f852 +size 131681 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9d6c7db8-2b78-4985-82e2-1ae0331054cd.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9d6c7db8-2b78-4985-82e2-1ae0331054cd.lance new file mode 100644 index 0000000000000000000000000000000000000000..30f495145d9b20b14791ddb6dddf784718eabfa9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9d6c7db8-2b78-4985-82e2-1ae0331054cd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bceeeb0c71e5f2d4b7f5ac392a4c9a0616fed77ffbc208b68a881b24ff2c85e +size 139819 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/9ea7752f-685a-4bf3-b77e-0d852e6a2bac.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/9ea7752f-685a-4bf3-b77e-0d852e6a2bac.lance new file mode 100644 index 0000000000000000000000000000000000000000..68a074517b57cd4b098f8814a538e0e536ce7c2f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/9ea7752f-685a-4bf3-b77e-0d852e6a2bac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a103dbb3e51290df4f79dab0bd1cdc83254960b159fa6236df641f9571f709f +size 141487 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a009e004-1bae-49fb-83c1-40b08649c4e2.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/a009e004-1bae-49fb-83c1-40b08649c4e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2ba15e68dc6a0e8b583edb3aead1a4a838db1f7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a009e004-1bae-49fb-83c1-40b08649c4e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a4b618e029b4fcbfb1e44f114a4a9cd1231100a1e5594fc6f86609ae5e8056 +size 137185 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a066990d-6ac3-40d1-a7f7-d63fd3bdfcc7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a066990d-6ac3-40d1-a7f7-d63fd3bdfcc7.lance new file mode 100644 index 0000000000000000000000000000000000000000..7ff7605a91f8333eed9970a540b4fc8c5757d519 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a066990d-6ac3-40d1-a7f7-d63fd3bdfcc7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d99dd909a61211352f62bea46df427af81d7edd512828ea52840c591c571c6d +size 138216 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a0eed58c-5585-4432-b2c6-e2196383f409.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a0eed58c-5585-4432-b2c6-e2196383f409.lance new file mode 100644 index 0000000000000000000000000000000000000000..cbe62885e38b51d5ca00262e625d36fd1330a7f7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a0eed58c-5585-4432-b2c6-e2196383f409.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8d73a21c012f303811e0524b5c2b85e93ca82466143bd20529ca6da5c1e5fed +size 141163 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a152161a-3e90-456f-b4fc-e2243efecace.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a152161a-3e90-456f-b4fc-e2243efecace.lance new file mode 100644 index 0000000000000000000000000000000000000000..1be33eb7f7d5f4e0860a6f860a4c614c73909714 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a152161a-3e90-456f-b4fc-e2243efecace.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbfbb232505fc86baa13e0dff1efa9307fcad624a71923784b4dcbe7528e7ef1 +size 136924 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a1d28dab-1df8-4dd9-a6ff-c7ba66dc5492.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a1d28dab-1df8-4dd9-a6ff-c7ba66dc5492.lance new file mode 100644 index 0000000000000000000000000000000000000000..886b195ff505b31a5e75bb1e87800b725aab837f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a1d28dab-1df8-4dd9-a6ff-c7ba66dc5492.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d097eb4286cc2fa4a0b50795f167da21a2382c04eee319174cb59784040f7007 +size 136551 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a2787e87-14a1-4397-a110-71c0d88addb1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a2787e87-14a1-4397-a110-71c0d88addb1.lance new file mode 100644 index 0000000000000000000000000000000000000000..8c8091cf51d5bc56cee9923a2face61cedb464d2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a2787e87-14a1-4397-a110-71c0d88addb1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55350cf7945684ae6cffb9b920f50b5d69532295a00f12e08c49a10383711cb4 +size 140589 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a2863f64-6bbb-487e-8519-da8a1226cda4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a2863f64-6bbb-487e-8519-da8a1226cda4.lance new file mode 100644 index 
0000000000000000000000000000000000000000..9d3f4c28815ed373d9e40448f9bc27151bab5dd3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a2863f64-6bbb-487e-8519-da8a1226cda4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2702c9565b1b935b4497debdc2cdd3e73656642449463ced5cc248c18837fad2 +size 140147 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a4d947bd-ce06-463a-80d9-8600986a2b26.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a4d947bd-ce06-463a-80d9-8600986a2b26.lance new file mode 100644 index 0000000000000000000000000000000000000000..e6db943a4ad4695eab18e0c7253e72e55e6b2355 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a4d947bd-ce06-463a-80d9-8600986a2b26.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db289c23938721c0d9a626de19f08c500a6c4aafe26a94fb2021d569b133215a +size 134753 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a6c1d63b-a0a9-4076-b580-89a72c253078.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a6c1d63b-a0a9-4076-b580-89a72c253078.lance new file mode 100644 index 0000000000000000000000000000000000000000..37247217ad89a33f2f69b7f4033a1b6762be07e5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a6c1d63b-a0a9-4076-b580-89a72c253078.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:133bfd2c3f696cff540a3e2cd7c0a237a517d7cca9fb042a68492da10275e6ec +size 138839 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a74c4392-191a-4d4f-ad68-c6edbd145810.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a74c4392-191a-4d4f-ad68-c6edbd145810.lance new file mode 100644 index 0000000000000000000000000000000000000000..860b9b31cb8544c7bbb342ffc0ae9527c0bd8a25 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a74c4392-191a-4d4f-ad68-c6edbd145810.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00d24ddf3a239308b53f55f1043bd500a3d12c44e630916a099ebc30d109cc09 +size 144031 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a78e28b4-14aa-42de-950e-eac89aa800a5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a78e28b4-14aa-42de-950e-eac89aa800a5.lance new file mode 100644 index 0000000000000000000000000000000000000000..2060e0709c4ac70bcbde10982bfd148958fe7ec5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a78e28b4-14aa-42de-950e-eac89aa800a5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56a35da1b86f586ba3b1e678fd11b341c63b378e4676a3fea094695a90a750c2 +size 144284 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a8a6c8f2-f8d5-412a-bdf9-28edfc2b368c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a8a6c8f2-f8d5-412a-bdf9-28edfc2b368c.lance new file mode 100644 index 0000000000000000000000000000000000000000..cd04559200b023156ba2d998379eba18d2fa20cf --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a8a6c8f2-f8d5-412a-bdf9-28edfc2b368c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bbbe9f3f887f066acc930e821ae723dfa589c206c8c251b268a19b9dd5033c5 +size 138590 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a8baa4fd-0d33-499d-b1ab-53b1c9c353df.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a8baa4fd-0d33-499d-b1ab-53b1c9c353df.lance new file mode 100644 index 0000000000000000000000000000000000000000..f73381b2dc17daff9a362993beda40709d64a60e --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/a8baa4fd-0d33-499d-b1ab-53b1c9c353df.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3309f88c01f722fec7bb773e5715829f62f7486a25bf6dddaa77dbefa662044 +size 137242 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a97ce191-050e-4939-8525-222309bd1039.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a97ce191-050e-4939-8525-222309bd1039.lance new file mode 100644 index 0000000000000000000000000000000000000000..d150878dc5f8c6f340911fc155e5777d86e4c24f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a97ce191-050e-4939-8525-222309bd1039.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4839ee46be745246fdc68d405c29d74fe2d901c800f292a067094ce08570ffb +size 140439 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a98cf800-ab31-48db-8ddf-a0f279324e6c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a98cf800-ab31-48db-8ddf-a0f279324e6c.lance new file mode 100644 index 0000000000000000000000000000000000000000..6f5e1940702e3414439215936f5655ce79178b3b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a98cf800-ab31-48db-8ddf-a0f279324e6c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ef420efad10ffb08571cb4a41152ecd268b98d146b931f363d7891378eaa88 +size 140076 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/a9cc16de-58f2-4c9e-a48a-86f5645232a2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/a9cc16de-58f2-4c9e-a48a-86f5645232a2.lance new file mode 100644 index 0000000000000000000000000000000000000000..d41d1dcc2a69628be80e86649d30dce362f478fe --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/a9cc16de-58f2-4c9e-a48a-86f5645232a2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3de31ef87828063d1f423e809fdb312eb2abc1ff3d6b1f71eaa4abae46e1b3d7 +size 142758 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/aa2944d6-5366-4e34-b955-244d12946448.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/aa2944d6-5366-4e34-b955-244d12946448.lance new file mode 100644 index 0000000000000000000000000000000000000000..0b49dfa984b8d4795cededb6995401e62dbf8b49 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/aa2944d6-5366-4e34-b955-244d12946448.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:033e836216a7eb6fd0ebaa5ad95316239dd87b66a328e742f6622879bdb32160 +size 138895 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/acfdc1d8-6bfc-47f5-88aa-ac9cfffc3eb7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/acfdc1d8-6bfc-47f5-88aa-ac9cfffc3eb7.lance new file mode 100644 index 0000000000000000000000000000000000000000..d81831bd70dabb0ac2cd9837f7beebbf838e71f7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/acfdc1d8-6bfc-47f5-88aa-ac9cfffc3eb7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e621eac1d7dd7b29fda36b606b7f5ceb7d251517e0c53e1ddddcb860e3e56f8 +size 140556 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ae819f12-724b-4a17-9203-fa767dd9a1e5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ae819f12-724b-4a17-9203-fa767dd9a1e5.lance new file mode 100644 index 0000000000000000000000000000000000000000..8d5b3f4d55a9d1295733e7255b7c8b88b2b47c78 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ae819f12-724b-4a17-9203-fa767dd9a1e5.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d0dd84d09b9c717d4eed3b38067050d4bf7c4df94c1db0e08638cc48318dac14 +size 139446 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/aee74fcf-c07f-4a88-93fb-c75b934af870.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/aee74fcf-c07f-4a88-93fb-c75b934af870.lance new file mode 100644 index 0000000000000000000000000000000000000000..b0a6aca17faa7639b549f1ac371910ee2e6a6482 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/aee74fcf-c07f-4a88-93fb-c75b934af870.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0eadf30cbf21664205365c82b357e928dc97c4fe13cb55a638d0fdb209384af0 +size 139700 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/af7ee935-6c60-45df-bb3a-bd86ec612bd9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/af7ee935-6c60-45df-bb3a-bd86ec612bd9.lance new file mode 100644 index 0000000000000000000000000000000000000000..9137834c2ba5e27f523e94ac098089fb9dc4c6d8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/af7ee935-6c60-45df-bb3a-bd86ec612bd9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99065d95695d3ec8749531a37378f2986943e21dd1429eb35c9a7b10d95fc2fb +size 144179 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/afe27a94-4986-480c-90ab-6ea3ac25136e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/afe27a94-4986-480c-90ab-6ea3ac25136e.lance new file mode 100644 index 0000000000000000000000000000000000000000..9cac33b08353381a93bee6f4711cfe369e154de0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/afe27a94-4986-480c-90ab-6ea3ac25136e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08be39018282f7dd517e486bca67e32ef7548bfbe90f8f8c785366069c203196 +size 139649 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b2bc97a0-1262-4ad9-b346-1756b4d6e4f1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b2bc97a0-1262-4ad9-b346-1756b4d6e4f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..b7012f96b3576d846a7f32b6f206a111d3bcf2c8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b2bc97a0-1262-4ad9-b346-1756b4d6e4f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df973eea4d006fb3821ce9642aba1913084b5ddcb36582d080f0d2ade5df427 +size 139568 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b3833f64-ba02-4412-94d0-409d37e0e3c2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b3833f64-ba02-4412-94d0-409d37e0e3c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..22e772b141ac111c5b3e3deb9c2aeb2c9ea2517b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b3833f64-ba02-4412-94d0-409d37e0e3c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04d6c4cfdb46283b353ef96968ad2b0094f5522bb2a409277840f68211ac41c9 +size 147203 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b3941996-e858-42d5-9396-1d19e0b2d506.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b3941996-e858-42d5-9396-1d19e0b2d506.lance new file mode 100644 index 0000000000000000000000000000000000000000..eb8cbcd6950e8c235a4d1efee69d7a63a9487abb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b3941996-e858-42d5-9396-1d19e0b2d506.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afaf221a86c2f805c7646ea287ad7ab7c760bf093fa50f7be1ab2ce9513bf614 +size 139849 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/b4127f04-8da7-48f7-8a34-9f9696e21627.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b4127f04-8da7-48f7-8a34-9f9696e21627.lance new file mode 100644 index 0000000000000000000000000000000000000000..31bea326eb0f9d9f00148f2bce36afe2864406ac --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b4127f04-8da7-48f7-8a34-9f9696e21627.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84a5ec46ed5c15c2d033e0c6e3f8c8ed594e9d88839df941f6a367a626767f3a +size 139298 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b4736b22-f968-4984-9ed2-7e4f45c43a36.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b4736b22-f968-4984-9ed2-7e4f45c43a36.lance new file mode 100644 index 0000000000000000000000000000000000000000..ac781391bc9b1243a97a780504f6a39e6b38decc --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b4736b22-f968-4984-9ed2-7e4f45c43a36.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3694487928517a16f227fde3a2b531f23587a8720d03e1097a1eae2e864ec18 +size 143824 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b50183a7-c357-446d-993a-2189edd9235f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b50183a7-c357-446d-993a-2189edd9235f.lance new file mode 100644 index 0000000000000000000000000000000000000000..f3565be50624cacc4a3b82de098f97412b174aa8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b50183a7-c357-446d-993a-2189edd9235f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffdf68bf4e53fd26e43a9a1fdd7335fd3f993aae8f9bc60343f0e95930bc7999 +size 141846 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b502b0f7-8687-486c-bc60-c948b82ee437.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b502b0f7-8687-486c-bc60-c948b82ee437.lance new file mode 100644 index 0000000000000000000000000000000000000000..57adc6f928ee439e0859e953e2a37177e7bbe842 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b502b0f7-8687-486c-bc60-c948b82ee437.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e90fdf8cf7717cf79c6800c5318cb59798cf140fa34d02a15993d321a0f5d192 +size 140768 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b5295612-dc8a-4d6b-a941-bc70615dc581.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b5295612-dc8a-4d6b-a941-bc70615dc581.lance new file mode 100644 index 0000000000000000000000000000000000000000..da5a56fea2cc08e7299e1295a046606e68a215d6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b5295612-dc8a-4d6b-a941-bc70615dc581.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f9cb9656eca3a2075134b51fe98d1307fef51cb85749b9cc35823b738de2014 +size 139330 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b530828d-0528-4799-b031-bcb8eba9d8a0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b530828d-0528-4799-b031-bcb8eba9d8a0.lance new file mode 100644 index 0000000000000000000000000000000000000000..b7266824e868973cee8e8c8e66f945644dcb9814 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b530828d-0528-4799-b031-bcb8eba9d8a0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7897f77a814befe9bb4c07b3ca34fa5b1a4efa2372a3c1d1b41bf4d2525e95c +size 109878 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b533fcdd-69e9-46df-95de-4b58900d7ae2.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/b533fcdd-69e9-46df-95de-4b58900d7ae2.lance new file mode 100644 index 0000000000000000000000000000000000000000..c6a5ec8a4b738394284f103894e3ec4d1e32cb23 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b533fcdd-69e9-46df-95de-4b58900d7ae2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ed7330db705db855623096df46f187190102420598aeb2c6314f18b091091d8 +size 144230 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b619cd43-f5fd-4a63-a744-70fa18813295.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b619cd43-f5fd-4a63-a744-70fa18813295.lance new file mode 100644 index 0000000000000000000000000000000000000000..671c6f0f033eb19e5be1d769da72b249126c7405 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b619cd43-f5fd-4a63-a744-70fa18813295.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a70e1ed461832421c547bf11c32537d9331d8a04e85ff7a74fa332c215b7b583 +size 142836 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b7072551-867c-4cf3-b854-a147a993747a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b7072551-867c-4cf3-b854-a147a993747a.lance new file mode 100644 index 0000000000000000000000000000000000000000..7271421b2018525d715bde984eea25be860c9a4c --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b7072551-867c-4cf3-b854-a147a993747a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67c7501e66e64ef3e2cbfb9224183c8339554fc106ff13d5f13dab19b925bef8 +size 140371 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b79508f5-1b03-4fe3-95e1-cb0db8b32e85.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b79508f5-1b03-4fe3-95e1-cb0db8b32e85.lance new file mode 100644 index 0000000000000000000000000000000000000000..2f042678dcfcf5f0763ab1b4a5015aa0a19be6b9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b79508f5-1b03-4fe3-95e1-cb0db8b32e85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:754ad7d916da2438792776e0e22a4607bc9fb5c384202989ac960fd9b4ed307d +size 142377 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b7e6a027-617e-43b8-a6e3-42ba13dd35f1.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b7e6a027-617e-43b8-a6e3-42ba13dd35f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..edead34aa91ce3aed0f441247105057d3eb090be --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b7e6a027-617e-43b8-a6e3-42ba13dd35f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f809094ce2bc0f906e11ef55f076444208b0b771ce9495f6ee05d6aaf9f535 +size 138559 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b889259f-8349-445f-a5fa-344cc267c859.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b889259f-8349-445f-a5fa-344cc267c859.lance new file mode 100644 index 0000000000000000000000000000000000000000..38f1564b466917598424bf057b848f6a50f6eaa7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b889259f-8349-445f-a5fa-344cc267c859.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ab5f2eac0b380c997d15f39e2c10758b3ca3e6beb68076a1e099b1a2ca59af +size 136150 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/b9bf4dae-a211-4e07-a59a-a3da693cf208.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/b9bf4dae-a211-4e07-a59a-a3da693cf208.lance new file mode 100644 index 
0000000000000000000000000000000000000000..dc98f80e831c2c3683e036a39faee4d31fbaed8d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/b9bf4dae-a211-4e07-a59a-a3da693cf208.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a308c28dee94c227dc14f0fb1274376804c49a275fa8c730b3996ce692dd2f8f +size 136564 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ba40af82-a372-4122-9a16-2a3eb98b8f37.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ba40af82-a372-4122-9a16-2a3eb98b8f37.lance new file mode 100644 index 0000000000000000000000000000000000000000..327ca464f584aa4d5eb130dc2372ac6c56aef805 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ba40af82-a372-4122-9a16-2a3eb98b8f37.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2c2b63e65ecbb70f58c6bf36a8f84c291cfd2e429f134a910328d48d7dfb381 +size 140511 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/bb9df738-c066-46b8-9a51-3b4a8ff5d2bc.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/bb9df738-c066-46b8-9a51-3b4a8ff5d2bc.lance new file mode 100644 index 0000000000000000000000000000000000000000..5bf1ba5899c2f07db5efb14200e0a989929785f0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/bb9df738-c066-46b8-9a51-3b4a8ff5d2bc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eecfd537a6adfff929de61c774039003d4f3f7937686ee9241b216ea84fdc86 +size 144978 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/bbe9157b-c661-419d-9e7f-d4be3eb34f9a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/bbe9157b-c661-419d-9e7f-d4be3eb34f9a.lance new file mode 100644 index 0000000000000000000000000000000000000000..88eeb7476416bfba4f7b49a2228385cac1a756aa --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/bbe9157b-c661-419d-9e7f-d4be3eb34f9a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59bd0228e0abc26b749a1557de21b94f0415197d6b421dc6939aebaac82bed72 +size 144168 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/bc87d85a-55e2-489e-b8d5-6041cfba693c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/bc87d85a-55e2-489e-b8d5-6041cfba693c.lance new file mode 100644 index 0000000000000000000000000000000000000000..31e04302a3d99a756a1b8a9cf926a758eb880302 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/bc87d85a-55e2-489e-b8d5-6041cfba693c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe34ad768226b310ada8c849c5e86e3c5b852ec8d3d269cabc1251af53238be1 +size 143354 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/bcff3198-47c6-4ccb-9c5f-6ee500b128e7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/bcff3198-47c6-4ccb-9c5f-6ee500b128e7.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b374ba69cdb24b8c6e475d1bd5016082782a530 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/bcff3198-47c6-4ccb-9c5f-6ee500b128e7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38c43891c3eaf9d48079df18a3ed6f2df9b086751efb4d7ae9b05ef6612d272e +size 140784 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c0ac2ac3-6402-4f99-b9d8-658d708ba6d9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c0ac2ac3-6402-4f99-b9d8-658d708ba6d9.lance new file mode 100644 index 0000000000000000000000000000000000000000..3ea2dc87818098ae90025e68651fd301a26505e5 --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/c0ac2ac3-6402-4f99-b9d8-658d708ba6d9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b057c3a650ed0c5961f18adebb45992801b1a0bc1a36fd4de97c15b3d9a06d5 +size 137703 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c1025142-a43a-44ea-bcf8-0dbc260222c0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c1025142-a43a-44ea-bcf8-0dbc260222c0.lance new file mode 100644 index 0000000000000000000000000000000000000000..e1286106e2afc7044282b477ebc9fe440c00d0b2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c1025142-a43a-44ea-bcf8-0dbc260222c0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028811cd8b2a382fed1c89e3c25081f8b15101a7b0419fd6d87919e8763ac3e0 +size 141980 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c1073f9a-de9e-410a-b68f-d000cd8b6419.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c1073f9a-de9e-410a-b68f-d000cd8b6419.lance new file mode 100644 index 0000000000000000000000000000000000000000..f4d833764225eda2e43f1abb613617bef9066536 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c1073f9a-de9e-410a-b68f-d000cd8b6419.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eaca9cd7a3693a12d99eda6e06f29f2ce1976cbe257dbad598685b4f99c3088 +size 137123 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c19cf4f5-0aec-4124-875a-adfa3ed3cabf.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c19cf4f5-0aec-4124-875a-adfa3ed3cabf.lance new file mode 100644 index 0000000000000000000000000000000000000000..05a0675a563bc308a0cc7e4762831c3faf76e6dd --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c19cf4f5-0aec-4124-875a-adfa3ed3cabf.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2f89317d2fc8a99567aa76c9a64b498a216db9a5340f04e450a8518584c88a3 +size 145718 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c279c516-4aa8-4acd-a4dc-f2fb0b051192.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c279c516-4aa8-4acd-a4dc-f2fb0b051192.lance new file mode 100644 index 0000000000000000000000000000000000000000..e02f689d8c5b08291a0e4198c0932d4c174abceb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c279c516-4aa8-4acd-a4dc-f2fb0b051192.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd5f220652950ffeb72bee5046e9ab5d8e2406e09fbaae0ad1fb697f8a0ed2e +size 143943 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c2a8311c-af87-43da-8b4a-e48db4a13404.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c2a8311c-af87-43da-8b4a-e48db4a13404.lance new file mode 100644 index 0000000000000000000000000000000000000000..64389b0b5c3553621b4c0abdfda749fa683d9a85 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c2a8311c-af87-43da-8b4a-e48db4a13404.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a2a272b60c0fa9a78624c9c638d1e3de7525f5970494debf0e2e5e1d06b05fb +size 140729 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c2c50021-0a69-4b33-9c22-79708805d004.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c2c50021-0a69-4b33-9c22-79708805d004.lance new file mode 100644 index 0000000000000000000000000000000000000000..44e98c9e8b2b5b11bf647e26ac8ed70df138869d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c2c50021-0a69-4b33-9c22-79708805d004.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3dcbcbf9f15713640cbdec4d5178a923e31aeda0dc6cb34d6e4e3db9bac1415e +size 160126 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c315d1a0-7dfb-4558-9060-db778cdbc650.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c315d1a0-7dfb-4558-9060-db778cdbc650.lance new file mode 100644 index 0000000000000000000000000000000000000000..13e5d9779624b1fa9e7f7288986ef0fb8fdfdcf3 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c315d1a0-7dfb-4558-9060-db778cdbc650.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d907daefcca36bef9775e47dc00c44387cf662a128d31e331b235420dee2057 +size 143359 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c3177080-614f-4a8b-a195-ff0cce1ecdcb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c3177080-614f-4a8b-a195-ff0cce1ecdcb.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a65b28fe6fe43c07c5c12aa8e341fe8cbed26e2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c3177080-614f-4a8b-a195-ff0cce1ecdcb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06a4f3a29f3efc54a128cfa361f4f62245e20cb4c0fe3e043bb1df68add0db75 +size 161182 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c37880ed-5eed-46dd-9377-e5512e74246a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c37880ed-5eed-46dd-9377-e5512e74246a.lance new file mode 100644 index 0000000000000000000000000000000000000000..80e75e20e367a5203d27a62781981e3659956847 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c37880ed-5eed-46dd-9377-e5512e74246a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:974bfd5ff086eb7605cc1e784de59e0fdfaf1f7c3e0538f3eea30296e29edda6 +size 145205 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c37e3d07-73ef-4359-becc-a56c81e7273e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c37e3d07-73ef-4359-becc-a56c81e7273e.lance new file mode 100644 index 0000000000000000000000000000000000000000..ab2c16f71884387229e5d9f698c9a6cd4a0cf43d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c37e3d07-73ef-4359-becc-a56c81e7273e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c9e2c526f0a72f785cff3d641e3d51931f2315a068e61d3d3f5e207e18f5586 +size 137334 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c3ea3e30-b9b5-47d9-bcd4-8621c3e68939.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c3ea3e30-b9b5-47d9-bcd4-8621c3e68939.lance new file mode 100644 index 0000000000000000000000000000000000000000..cc5ee2fa20d931c837fb2af60250f46854abe3d0 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c3ea3e30-b9b5-47d9-bcd4-8621c3e68939.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:095cb968fad1d8e45f6c203d3ecd751388224316a207138c5181a313a2198bbe +size 136410 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c46c20b0-ba52-4112-8f7b-c78019dad71c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c46c20b0-ba52-4112-8f7b-c78019dad71c.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3c756b601ddfda17f8f7b87d4a1ddf408441bf4 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c46c20b0-ba52-4112-8f7b-c78019dad71c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b49cc52c953989e25e70532e5d296143957690061af47d235d993d5b74504c1c +size 141481 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/c46d9b53-746c-48dd-a527-5125835a8ed8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c46d9b53-746c-48dd-a527-5125835a8ed8.lance new file mode 100644 index 0000000000000000000000000000000000000000..a389eaeec74ce75230302b7e467a9d36e975ffa9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c46d9b53-746c-48dd-a527-5125835a8ed8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734f76a2b4960b07343047e17e7c2ad1e5b5f0fd62eccd7930a54c9b34c2ddee +size 142049 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c569dd0d-6781-4278-bb94-9200fd17297c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c569dd0d-6781-4278-bb94-9200fd17297c.lance new file mode 100644 index 0000000000000000000000000000000000000000..048891f83f9cb893dcae376f8e6c88824f78723f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c569dd0d-6781-4278-bb94-9200fd17297c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938ea1a7e5cfe5f2a6e69e33761910918167657e8eedecfe3ec285a9662fdae1 +size 139990 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c5bc787a-a721-4c07-be32-fbabe64f283f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c5bc787a-a721-4c07-be32-fbabe64f283f.lance new file mode 100644 index 0000000000000000000000000000000000000000..39b6c0046fb2897f3445c7f8b0d6e627a0fbf488 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c5bc787a-a721-4c07-be32-fbabe64f283f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78026104bc209699475b7b63dba0f4c55373d435577f9c8103d0cd532e932e8 +size 139100 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c6ddc231-96c2-42ee-ab69-9049ee776e39.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c6ddc231-96c2-42ee-ab69-9049ee776e39.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b5a94ea793cda66931afe29c24bbc28c5b7d1b2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c6ddc231-96c2-42ee-ab69-9049ee776e39.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703f1a9815f8db0b22c07e8d90d6d03a311bebc424ff5bfa5b1d95bd99770fab +size 142924 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c6f27772-d026-42cf-a37d-62b45112a955.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c6f27772-d026-42cf-a37d-62b45112a955.lance new file mode 100644 index 0000000000000000000000000000000000000000..db31f0a1bc0bd87a36708b6c9d4adc97ac70dc18 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c6f27772-d026-42cf-a37d-62b45112a955.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:262a153ba69814dbdfcc9d267cdecbe13b4ef0d7897a2b8c1be1de4764078265 +size 140543 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c6f49ba3-5fe6-49d9-b402-d065cdda4c63.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c6f49ba3-5fe6-49d9-b402-d065cdda4c63.lance new file mode 100644 index 0000000000000000000000000000000000000000..1506ff28d2eb3a56d09f14a918b6ebee4c3d5299 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c6f49ba3-5fe6-49d9-b402-d065cdda4c63.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d076ca71b70589b3f9c8694e670a77f4d1803cca98876fc77d3abf5f39fa9d3 +size 143649 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c7cec91c-9a58-4ec5-a4b8-ace93c47013f.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/c7cec91c-9a58-4ec5-a4b8-ace93c47013f.lance new file mode 100644 index 0000000000000000000000000000000000000000..93a950900f3f7f92b55fe7d997f3d7c659cf7274 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c7cec91c-9a58-4ec5-a4b8-ace93c47013f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8aa56c78500d628287554fe3662fdb29002b28e972fb7aee15c8a58b80e53f2 +size 148902 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c7d092fa-6ea6-4b2e-b672-3632fc330416.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c7d092fa-6ea6-4b2e-b672-3632fc330416.lance new file mode 100644 index 0000000000000000000000000000000000000000..08c57ccd3b7dfcda3b9c2c617be0782cc1abd0ba --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c7d092fa-6ea6-4b2e-b672-3632fc330416.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dbe270ad061ab9bca89241ea1d0249a238286963851623bfeb85b48f0da96ae +size 137768 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c7e3b9dd-9b21-4f13-9794-dd31d578e566.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c7e3b9dd-9b21-4f13-9794-dd31d578e566.lance new file mode 100644 index 0000000000000000000000000000000000000000..f3615843658395de05f8f75b327f60e861c015d4 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c7e3b9dd-9b21-4f13-9794-dd31d578e566.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e18f5230b695661e887bf745d98ab0856c2651c1ecf74144e1f4c2f02fcd143 +size 143096 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c81f90d7-d31f-4e11-bc95-982d757de079.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c81f90d7-d31f-4e11-bc95-982d757de079.lance new file mode 100644 index 0000000000000000000000000000000000000000..2da48e55f322582f0107fa7908c57b00d1da2fe5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c81f90d7-d31f-4e11-bc95-982d757de079.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4093cd9b15c9eabdc03e8cd60ae2b7399f360d165415fc3b5d4a3dc30296ee4c +size 138595 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c95294f6-87ac-4415-b0c4-787a7ad52b3c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c95294f6-87ac-4415-b0c4-787a7ad52b3c.lance new file mode 100644 index 0000000000000000000000000000000000000000..976db0dc650af2aeea00015bdbff4926a58a1b32 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c95294f6-87ac-4415-b0c4-787a7ad52b3c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adbbd375fc9b7cbf20602194755cc8f7e5a945bdb15bcc24e58b4bb9e969fc14 +size 142808 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/c9bf175c-5a5e-4f10-9b8e-2c5ecec5a728.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/c9bf175c-5a5e-4f10-9b8e-2c5ecec5a728.lance new file mode 100644 index 0000000000000000000000000000000000000000..aa882d8c85c2dcb9cf6bea829c4879fe731b0b7e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/c9bf175c-5a5e-4f10-9b8e-2c5ecec5a728.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f415d7c5a83a25a8cc575fff8ca865d2b61038affbfb2b528c2b178a1f26c7 +size 140694 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ca2d18c4-f5b3-44bd-8ac1-6d8fc273f858.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ca2d18c4-f5b3-44bd-8ac1-6d8fc273f858.lance new file mode 100644 index 
0000000000000000000000000000000000000000..a39a8fcd4acdbd1d2c2090e4411fe10054b44944 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ca2d18c4-f5b3-44bd-8ac1-6d8fc273f858.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c67b251ced5c1ca4ab3e5781bda0a6f17f5f0f4f76d1645f8fefba3e2db5732 +size 143909 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cc9ceacc-9839-4b56-a295-3868e88e22cc.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cc9ceacc-9839-4b56-a295-3868e88e22cc.lance new file mode 100644 index 0000000000000000000000000000000000000000..e863205c75a89686e3e244b6608a42386fe4211e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cc9ceacc-9839-4b56-a295-3868e88e22cc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:830fb03401f8a9fbd2dd68159f42e4f5840edea2caad6c46e2e68fe976108e62 +size 143220 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cca04e29-14eb-41b7-93cb-d1ec4ae9aa9d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cca04e29-14eb-41b7-93cb-d1ec4ae9aa9d.lance new file mode 100644 index 0000000000000000000000000000000000000000..d9899552ff7e274427c026d23711ed40ac973fc2 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cca04e29-14eb-41b7-93cb-d1ec4ae9aa9d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58e032e12bc86977dd7cd40b05e70bd95cad1b49de431dbeeb1e5e8a3f3ddfdf +size 135578 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cca3ca72-7cbc-468e-aa46-9b8b22dadc38.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cca3ca72-7cbc-468e-aa46-9b8b22dadc38.lance new file mode 100644 index 0000000000000000000000000000000000000000..bd7d490418a61bba8536e30cb2930a1eda490a1e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cca3ca72-7cbc-468e-aa46-9b8b22dadc38.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8b7bb8a6f803982a5140cd27b844054ecdb1db3f5d7c810077c8bc3d74e22e9 +size 143565 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cf1c20b9-977d-4fdd-b1fd-68eea1d9fcad.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cf1c20b9-977d-4fdd-b1fd-68eea1d9fcad.lance new file mode 100644 index 0000000000000000000000000000000000000000..dcb420a26a27bd9ac8fb735a5e862f5dd1e36257 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cf1c20b9-977d-4fdd-b1fd-68eea1d9fcad.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d785994522baeb6ccbe3eae955a5b7425590dff40ae294520a3ff0a9c5b85d7 +size 140053 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cf625d7e-7da8-44fc-aa69-a7e14c544a1a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cf625d7e-7da8-44fc-aa69-a7e14c544a1a.lance new file mode 100644 index 0000000000000000000000000000000000000000..4295f6500da7472cf8f1facd241dc2755c219bbb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cf625d7e-7da8-44fc-aa69-a7e14c544a1a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06e9a302de67cd1f888458a820d5a9f0235c2f557ada8816e383fa73667d4c99 +size 139286 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cfb5b746-a84d-4ab9-8445-edb22ef9e03a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cfb5b746-a84d-4ab9-8445-edb22ef9e03a.lance new file mode 100644 index 0000000000000000000000000000000000000000..522aea4a2f4283459d0817c673fd137f556e17ec --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/cfb5b746-a84d-4ab9-8445-edb22ef9e03a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9f7619a2bcf75f7dadd4cde7b556634a594fcc3059d6bea9c550e0f50939a53 +size 136183 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/cfdc9a59-5661-43df-b422-97eacade5d93.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/cfdc9a59-5661-43df-b422-97eacade5d93.lance new file mode 100644 index 0000000000000000000000000000000000000000..226a75a27ba3af88162a839aba0a8d5bf655c411 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/cfdc9a59-5661-43df-b422-97eacade5d93.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a65d4a9fbc4952e0ce7bbcf37f3a6c683c8d4f38ae2925b8cbd3e0465bfa9f0 +size 154715 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d01fd43f-c89e-4db5-ae41-2d260e46ad50.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d01fd43f-c89e-4db5-ae41-2d260e46ad50.lance new file mode 100644 index 0000000000000000000000000000000000000000..547a18d71fa3b14d1fdb7429d77584ebafa49abe --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d01fd43f-c89e-4db5-ae41-2d260e46ad50.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adacbcf4e85bbf8018b55122572cd658658a588cf3556677bd9fc2a714221280 +size 146216 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d05b5f1e-a331-4ca1-9e7c-9e971b40ee7b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d05b5f1e-a331-4ca1-9e7c-9e971b40ee7b.lance new file mode 100644 index 0000000000000000000000000000000000000000..c7744f11829b3590d89bda56e259396fb0195406 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d05b5f1e-a331-4ca1-9e7c-9e971b40ee7b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3683c82c661d95a9876e1231d53dd7c7b53527d83e90a15fbffd81d3a41f7c4 +size 141426 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d0ec4874-dd86-4205-9b07-b874a1c1561b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d0ec4874-dd86-4205-9b07-b874a1c1561b.lance new file mode 100644 index 0000000000000000000000000000000000000000..049058f89e77bc53c69e6da8af29916ecbb44c77 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d0ec4874-dd86-4205-9b07-b874a1c1561b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f6321416937d962f9843b4b33f0ee9b7f81fffc43a47e35541f014f044aafaf +size 136251 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d1aa5cc7-c3e8-434f-8eb6-bc73433684ed.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d1aa5cc7-c3e8-434f-8eb6-bc73433684ed.lance new file mode 100644 index 0000000000000000000000000000000000000000..561acaa97c845890ac5a24261f66c91e547b2ad6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d1aa5cc7-c3e8-434f-8eb6-bc73433684ed.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a1c3bbb9c50bc6d6e00716c178f560b7d2cbb62eff017673fcc66d7e0aa23ae +size 148660 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d22e08e0-db52-45b7-8ed5-cac2bb2bcdf8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d22e08e0-db52-45b7-8ed5-cac2bb2bcdf8.lance new file mode 100644 index 0000000000000000000000000000000000000000..d28c06a20e7eca8fc828201676884690e24c5b3f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d22e08e0-db52-45b7-8ed5-cac2bb2bcdf8.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a2582061647556a6db29b848a7464f4b39cc3c62733f558e7a0925f268dec443 +size 134813 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d304e1f7-57f5-4387-90ee-57923a799c2e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d304e1f7-57f5-4387-90ee-57923a799c2e.lance new file mode 100644 index 0000000000000000000000000000000000000000..e9905d14610c5f57c95e432c86b41a540a049a7d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d304e1f7-57f5-4387-90ee-57923a799c2e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5303561e98d0814f0575a71602a62c1f59df56da90c75e011ec51ee6541853ec +size 136954 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d3d8c657-806e-4fe7-b214-f588389a89b8.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d3d8c657-806e-4fe7-b214-f588389a89b8.lance new file mode 100644 index 0000000000000000000000000000000000000000..d53c2b66a73fd7c9e56c9cd3877eb4f20d9fdc97 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d3d8c657-806e-4fe7-b214-f588389a89b8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9369a1d99d92a50d77188b11064093fb34e379444853ab0665645a4b597f1de +size 133616 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d5e2f4b0-e090-4c85-aec5-97ff19507527.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d5e2f4b0-e090-4c85-aec5-97ff19507527.lance new file mode 100644 index 0000000000000000000000000000000000000000..328ced8760db8ad55551253a245779c0fe69f396 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d5e2f4b0-e090-4c85-aec5-97ff19507527.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49cde0820d88a0dca44098441bd6cf4d3d6ae72ea9d5cb5cba646c8e550cceb0 +size 140360 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d63fc02a-3d7a-4222-b70d-8fffae381589.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d63fc02a-3d7a-4222-b70d-8fffae381589.lance new file mode 100644 index 0000000000000000000000000000000000000000..1deddf7d404c0917f188f6a4b6a5010d679a13fb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d63fc02a-3d7a-4222-b70d-8fffae381589.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e927881e118859a9a2527ec7d9bf94fc5021171de77d78121e15ee266a91c98e +size 141677 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d721d577-b67d-4394-b4bd-36e844e3e997.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d721d577-b67d-4394-b4bd-36e844e3e997.lance new file mode 100644 index 0000000000000000000000000000000000000000..8bb42690398af8c1706ea2ffcce6ee53541df807 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d721d577-b67d-4394-b4bd-36e844e3e997.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a53a53455b49c9287ff6a50674c9a5a653bcf5bd308e86553f491906fed78cb3 +size 141072 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/d77b2049-66a0-4047-ae61-8d9e4880d7ea.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d77b2049-66a0-4047-ae61-8d9e4880d7ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..7a2e80ea1d2e78341003c995316ae1c3a1a0a248 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d77b2049-66a0-4047-ae61-8d9e4880d7ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81747c3fc7ecaa2043f30be53097943ddf23228411a047f4c96ed0db17b34508 +size 141128 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/d8cbeb28-8cc0-49a1-9887-53012a4456f7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/d8cbeb28-8cc0-49a1-9887-53012a4456f7.lance new file mode 100644 index 0000000000000000000000000000000000000000..111352a171cbe62efedc1d13ceb5c951b69f8ae5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/d8cbeb28-8cc0-49a1-9887-53012a4456f7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb5a863a3581887055ad553f01c118ecca1adc3bc9e5a23c507525f290966d0b +size 138751 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/db2331eb-e697-470d-8aee-eed0ea922a0a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/db2331eb-e697-470d-8aee-eed0ea922a0a.lance new file mode 100644 index 0000000000000000000000000000000000000000..452469efda7196d40f4f1fb9f002efd4c6e1f50f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/db2331eb-e697-470d-8aee-eed0ea922a0a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f43558c9781a0ca49ba23f619ef6776f5d7a9e1f7bd38444542031cbf613b2 +size 143310 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/db6cc40c-b746-4398-a481-755000487774.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/db6cc40c-b746-4398-a481-755000487774.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a8cacc49f544032406fabf2cca171ef5719afa7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/db6cc40c-b746-4398-a481-755000487774.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a8638df0b80029dce7b938e741dedf72d148e5fde354883d9a55a3d461c1cb9 +size 140162 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/dc15c2ba-da2d-4ec8-a2e9-dbbe4427f343.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/dc15c2ba-da2d-4ec8-a2e9-dbbe4427f343.lance new file mode 100644 index 0000000000000000000000000000000000000000..c85faaca460804a142922dbe6c2bb85fa069338b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/dc15c2ba-da2d-4ec8-a2e9-dbbe4427f343.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4cf9ccfde2d6e4f03fae5b34a87292fbb2bf475e451a39e8f1a73e9cf434b11 +size 144776 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/dd478cd1-6dad-4591-9058-f48171696375.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/dd478cd1-6dad-4591-9058-f48171696375.lance new file mode 100644 index 0000000000000000000000000000000000000000..d18edc638ab71cba825c7cf260a052b7ea02e40a --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/dd478cd1-6dad-4591-9058-f48171696375.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c7197dde455f735fe1702ee4166fdd70227cbf4a7543582206d0ae8dbe8c01c0 +size 139858 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/dd7669ad-9f3a-4deb-82a9-ecceb85ad10f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/dd7669ad-9f3a-4deb-82a9-ecceb85ad10f.lance new file mode 100644 index 0000000000000000000000000000000000000000..8afb00696e0dbd71892e4d25a0a62a127ebe0fcc --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/dd7669ad-9f3a-4deb-82a9-ecceb85ad10f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bac4709f1bf35a2858d6c969242debe3f0fb83e908898d944de5c3cb7d17b89e +size 144252 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/df0a07ff-14b8-4f4b-afa8-ea7c52fa50d9.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/df0a07ff-14b8-4f4b-afa8-ea7c52fa50d9.lance new file mode 100644 index 0000000000000000000000000000000000000000..5018e34df159da0e829888cce6d1e88ee6dd6117 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/df0a07ff-14b8-4f4b-afa8-ea7c52fa50d9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff61e7a35ef44c98c42788763a50e6aa29a0103158b3413c2ccab0f2fbe5bf3 +size 145968 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/df3901af-2b72-449b-9cf2-f4997288232e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/df3901af-2b72-449b-9cf2-f4997288232e.lance new file mode 100644 index 0000000000000000000000000000000000000000..db517a7c81eb64d483d060ff9c4ee9815a33c3f9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/df3901af-2b72-449b-9cf2-f4997288232e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d5d37581802db7ed887d6bf56593340161b4f88da7e8819b826f89a16892a65 +size 145376 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e0854466-b751-490d-be8e-122920924c77.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e0854466-b751-490d-be8e-122920924c77.lance new file mode 100644 index 0000000000000000000000000000000000000000..793d86e846a422e4bd176f6f2d107f85477c626e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e0854466-b751-490d-be8e-122920924c77.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8894a808894043a6172cf2c3370f4303e054f4259f94f4a9ef6be2f58cc4414f +size 146136 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e2381621-dffc-48ad-a478-2ba4aae3e6c2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e2381621-dffc-48ad-a478-2ba4aae3e6c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..a091f2950e31e93534d2382f24fdd6157b8f78da --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e2381621-dffc-48ad-a478-2ba4aae3e6c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085cc57337e1652ccd322ab974724d906cc68cb5c2cc1c48665e13daceb9134d +size 151579 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e2f2c46e-2288-4c00-a62e-cfe2cb7ed2eb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e2f2c46e-2288-4c00-a62e-cfe2cb7ed2eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..29decf20d5b91cd584d3316a46ee5dbd48316187 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e2f2c46e-2288-4c00-a62e-cfe2cb7ed2eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaa0aa012781078d3d46b6003fcaafa3f50e4bb3658bd7decc770c55566557eb +size 145470 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e40f707a-9ade-48b7-a474-2379efa7dc93.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e40f707a-9ade-48b7-a474-2379efa7dc93.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e2ec9c889dd7b829e43f8f789e8bc2513e71cce --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e40f707a-9ade-48b7-a474-2379efa7dc93.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca1496d6c5da64cc719e8b4a13c14c90362fbc91b2e428789e1f002911924907 +size 138154 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e4f1b678-c0db-4a07-9b22-684f0c567255.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e4f1b678-c0db-4a07-9b22-684f0c567255.lance new file mode 100644 index 
0000000000000000000000000000000000000000..7ab60248c4715351c280217ee8af6a66f8d60a6d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e4f1b678-c0db-4a07-9b22-684f0c567255.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c6e2e97aa203fb71095e9ae269a87f9293da11abddc568dd8f98e99caf25243 +size 143926 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e4f320b6-bf66-4b7f-a204-3a1a105aec17.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e4f320b6-bf66-4b7f-a204-3a1a105aec17.lance new file mode 100644 index 0000000000000000000000000000000000000000..908172ec8178da3faa8f65d4c9e3319968152c25 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e4f320b6-bf66-4b7f-a204-3a1a105aec17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c24f6e6c522f9e3def3d8422da8cfa1eef49d9e40c7e5445ed42025cbbf09f57 +size 142741 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e51dd02c-b8d2-4954-9898-983019748544.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e51dd02c-b8d2-4954-9898-983019748544.lance new file mode 100644 index 0000000000000000000000000000000000000000..9e9d8b21288729caa1978b2bf9b9b6cdd965bbb1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e51dd02c-b8d2-4954-9898-983019748544.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce9a4e7351a19baf89e12ca8f5006ef1d9f95bf7228f8a6c2c12ce99572acbba +size 138632 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e5316885-0f0f-4de5-9818-dabf152b2e7d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e5316885-0f0f-4de5-9818-dabf152b2e7d.lance new file mode 100644 index 0000000000000000000000000000000000000000..278a9cf58482923fe5e9ff8583d44bab4965b53b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e5316885-0f0f-4de5-9818-dabf152b2e7d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc69aeebbf196169278ac1de2db5fe5c25d346a3e8a684bb80d500119ea1c5ce +size 139942 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e58ae861-38dd-42c9-bf8a-79fa235e859c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e58ae861-38dd-42c9-bf8a-79fa235e859c.lance new file mode 100644 index 0000000000000000000000000000000000000000..9adc158126485f03f17d9f3612fe3ced51d28c22 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e58ae861-38dd-42c9-bf8a-79fa235e859c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d97a1e3ac5ca9db19fb6b8957e8ed175a0d73fcf5b92a929701ed7b234e634b +size 137991 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e5de67b2-aec4-4671-bf71-6a177f31d17c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e5de67b2-aec4-4671-bf71-6a177f31d17c.lance new file mode 100644 index 0000000000000000000000000000000000000000..6ee6716abe2a5dd1caa0760f5d398557ade0a9b7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e5de67b2-aec4-4671-bf71-6a177f31d17c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f038d77ec25a724b866358015fe0e950cfaa22ef3616514af30b741188acbc73 +size 147382 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e64f520c-3e58-4f83-b5aa-378786907b2c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e64f520c-3e58-4f83-b5aa-378786907b2c.lance new file mode 100644 index 0000000000000000000000000000000000000000..e7be1553724c41ffaed213c8a837545940c9d961 --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/e64f520c-3e58-4f83-b5aa-378786907b2c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:723c46f4b0513ce368ae29e2a9314639594d7b8a9dae357d678f8f7a4f783dae +size 137236 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e68798ae-3b27-4e81-822f-f2bcb41735a9.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e68798ae-3b27-4e81-822f-f2bcb41735a9.lance new file mode 100644 index 0000000000000000000000000000000000000000..b6ee433defc681e767822f7ce963ff9fd7bd8c80 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e68798ae-3b27-4e81-822f-f2bcb41735a9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5901372fdb4506c5b322149cc73ec8e287896eb7743d8f963f0c8844c5e05e53 +size 142435 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e6a02e8c-ee87-451a-9269-3e77e38bbe6b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e6a02e8c-ee87-451a-9269-3e77e38bbe6b.lance new file mode 100644 index 0000000000000000000000000000000000000000..a1fe525044dc3c6a598760102002a6a098807602 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e6a02e8c-ee87-451a-9269-3e77e38bbe6b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939d336588c0b7197af7777e735ef418d8dd9375c8eb7e47fff200188b669a6a +size 143126 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e77fad4b-5a63-4180-bf33-f0c801d35ad5.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e77fad4b-5a63-4180-bf33-f0c801d35ad5.lance new file mode 100644 index 0000000000000000000000000000000000000000..dca9475eaa0da58f35f285cbb439e6cbbae2db6f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e77fad4b-5a63-4180-bf33-f0c801d35ad5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4dcc9fbc244305369a384872af7b55800cef8dd8236667213feccb9d6b5184ee +size 141099 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e7b63b59-2efa-4686-931e-82204af2158a.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e7b63b59-2efa-4686-931e-82204af2158a.lance new file mode 100644 index 0000000000000000000000000000000000000000..bfd891cc6be2dd90897f69caede3b8ee5732a4d9 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e7b63b59-2efa-4686-931e-82204af2158a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21aba195a0a896ceb1050acbbd5c40d21ada52f6e234f0c1a952b096b893010b +size 138265 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e8697bf2-a037-43bf-9e65-c30d96afd9b0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e8697bf2-a037-43bf-9e65-c30d96afd9b0.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e6b058347f8bb307e8e593aca41d7220b5f26e6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e8697bf2-a037-43bf-9e65-c30d96afd9b0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b04f17fe732edc0eac86491b97679307d8642f0b2b4906b628227b3bebfed5d +size 140720 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e8e43edc-f1b4-48f7-8e53-0696d0765ab4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e8e43edc-f1b4-48f7-8e53-0696d0765ab4.lance new file mode 100644 index 0000000000000000000000000000000000000000..b01f7f042805685fddb6fcb2947b38d622e2a36d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e8e43edc-f1b4-48f7-8e53-0696d0765ab4.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:74883592a8ec7347c1bab9247e3e06b7988397f42a77623fee41269d4cd71f90 +size 140109 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/e9d73cd4-cace-4362-ad0d-32dbe945e50d.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/e9d73cd4-cace-4362-ad0d-32dbe945e50d.lance new file mode 100644 index 0000000000000000000000000000000000000000..c5efefef25f9b264e6787c8ec52fec5104db9c1d --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/e9d73cd4-cace-4362-ad0d-32dbe945e50d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61f297f0a5b9e60011ca8cdddab1b0212dc352954efe7d9c2106d30045003a4a +size 150026 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ea121175-6402-4006-8163-e5d35afe2d20.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ea121175-6402-4006-8163-e5d35afe2d20.lance new file mode 100644 index 0000000000000000000000000000000000000000..41faec9f558e6c1e86061eb0352d75b8d4472ec7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ea121175-6402-4006-8163-e5d35afe2d20.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f5205d69b687fa23c965cccdd08ca260dde077d8e712226fe613f6a01638bf +size 139241 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/eaae54cf-c127-4d4f-b0ff-f45ac6ba40d4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/eaae54cf-c127-4d4f-b0ff-f45ac6ba40d4.lance new file mode 100644 index 0000000000000000000000000000000000000000..928eb8021e390cfd2062fb3fbc10025a4ed2a7d7 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/eaae54cf-c127-4d4f-b0ff-f45ac6ba40d4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b12bf376ed36db30ca22ddc7c1d551d020800f39d80179005611602816c39256 +size 141200 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ec85c7dd-9609-407f-85c0-01dcb2283d9c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ec85c7dd-9609-407f-85c0-01dcb2283d9c.lance new file mode 100644 index 0000000000000000000000000000000000000000..b052df4764a41b8333aa89500df477b9472989c8 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ec85c7dd-9609-407f-85c0-01dcb2283d9c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad765c1dfed208984974496dd68059c273f8b537856034417c7ba7f93e76b74 +size 145689 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ee19e6ce-c682-4f83-81d9-3437d1f46375.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ee19e6ce-c682-4f83-81d9-3437d1f46375.lance new file mode 100644 index 0000000000000000000000000000000000000000..41b2274307fa8ebf09b7c03413a07b1027237f26 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ee19e6ce-c682-4f83-81d9-3437d1f46375.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:178a332891c72b7944c9a915e35d118745607857c5e0e7e8d9f30febe048f04a +size 142804 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ef76799e-4cd2-4109-9992-226ab16916a7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ef76799e-4cd2-4109-9992-226ab16916a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..72d218c4acd0178945b58ad0f0690208298617d1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ef76799e-4cd2-4109-9992-226ab16916a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51345895c1287564fabcd69a9881d925babb5318af19a041726252c58c2a5c33 +size 143204 diff --git 
a/.lancedb/content_aware_chunking_BAAI.lance/data/f002c082-f1d9-4647-a8bd-75a3085f3d71.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f002c082-f1d9-4647-a8bd-75a3085f3d71.lance new file mode 100644 index 0000000000000000000000000000000000000000..db5ef86ffa53e58342c3718f43d403031ac1775e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f002c082-f1d9-4647-a8bd-75a3085f3d71.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dfa700f5e4b5dcd494042895147e493c635b9223dc880346d3b1164b9b7e432 +size 139444 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f0133287-6048-450b-b9ec-8eeace9d53a7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f0133287-6048-450b-b9ec-8eeace9d53a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..fce3f3b9645d337d53580fcad061805985685f73 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f0133287-6048-450b-b9ec-8eeace9d53a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e72e43d830df921cfbeac1474fc4591499794bf3e042ff617c307ba5145c0dc8 +size 136847 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f0fa4a87-5687-4319-a0ea-710db14de3ae.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f0fa4a87-5687-4319-a0ea-710db14de3ae.lance new file mode 100644 index 0000000000000000000000000000000000000000..707ef443c8b8cfa1be47b3a99a55e3e0c3a578d5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f0fa4a87-5687-4319-a0ea-710db14de3ae.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5969be79faae3e1f3f4a1ad01bc82e2424ce8a87695d88f151adafbcdda0cc90 +size 139139 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f12f18dc-4542-442a-a4b9-03893206ff43.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f12f18dc-4542-442a-a4b9-03893206ff43.lance new file mode 100644 index 0000000000000000000000000000000000000000..ecf4c9e0b1ce629e9b90c93313717f053d945678 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f12f18dc-4542-442a-a4b9-03893206ff43.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189c6a86087c2ac99f1a330aaff6f43ad151d029dfa5af0c1e79eb7ef402be5b +size 140657 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f1ce5af7-8a47-4993-b5d8-a2bcdfec4b56.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f1ce5af7-8a47-4993-b5d8-a2bcdfec4b56.lance new file mode 100644 index 0000000000000000000000000000000000000000..135ce85167edd032bba71190c262616a8cefa3e5 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f1ce5af7-8a47-4993-b5d8-a2bcdfec4b56.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9280520757b2968e3c355e783df2688a65182a64866f6c5bd06f5a7a5fe9cf4 +size 138204 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f38d26c7-5498-45f3-b1db-116b90b09b8e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f38d26c7-5498-45f3-b1db-116b90b09b8e.lance new file mode 100644 index 0000000000000000000000000000000000000000..a664ec4741345086bbfc453f79317afddb071c1f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f38d26c7-5498-45f3-b1db-116b90b09b8e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a90fd43490cf0bb401a8df52749f7b1198c45f8c7bf32c0678e2f5cea9891507 +size 127958 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f4622781-52ab-4b4f-8322-0161356e4546.lance 
b/.lancedb/content_aware_chunking_BAAI.lance/data/f4622781-52ab-4b4f-8322-0161356e4546.lance new file mode 100644 index 0000000000000000000000000000000000000000..04b82a6611efbc31dceb8de2e1331840a3ce2966 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f4622781-52ab-4b4f-8322-0161356e4546.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f64114b81e79ac04fbe49df4d9b149b655ec0995d3256d95b6ff2ff7d9df5e0 +size 142267 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f46c57d1-b3b9-42ec-9fda-5ef57438193b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f46c57d1-b3b9-42ec-9fda-5ef57438193b.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2aac3aa6dd07fcf38abab0425f88557ebd3acdb --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f46c57d1-b3b9-42ec-9fda-5ef57438193b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:730ed3f9adde5430f2528c6ccb128e9fd950b39a2eb2f04d78eccaeda7bae113 +size 143313 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f47cf2c4-5177-4b2d-813a-3458e1df9232.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f47cf2c4-5177-4b2d-813a-3458e1df9232.lance new file mode 100644 index 0000000000000000000000000000000000000000..937d0d92356778c41b42722a60195e0009ec8ff1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f47cf2c4-5177-4b2d-813a-3458e1df9232.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d9997282d078a1d750b8baed1bcf7792b4e6b8c97bd0c45bb1bbef13e897ede +size 134874 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f49609d2-da41-4f95-9490-3d02c62b5f2e.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f49609d2-da41-4f95-9490-3d02c62b5f2e.lance new file mode 100644 index 0000000000000000000000000000000000000000..df09c9ccb21cb4bdfad3f1b0e107590fbfe1c94f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f49609d2-da41-4f95-9490-3d02c62b5f2e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22b67cfe2de011fd6f0be92be039abcf33ecb1b0ad9c4ea5339e860c40e19156 +size 136617 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f4a2044f-e401-4f51-a123-dc6e9b390072.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f4a2044f-e401-4f51-a123-dc6e9b390072.lance new file mode 100644 index 0000000000000000000000000000000000000000..3fd8427312ed1dcbb8455a10621e461c3c9c2c0b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f4a2044f-e401-4f51-a123-dc6e9b390072.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc1daa10e84b08ab137433a7170ee036512fe2c02ee49b83f134f9ca2ddcb8d1 +size 140003 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f4b28670-95d7-41ea-9769-e6ca08634dd2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f4b28670-95d7-41ea-9769-e6ca08634dd2.lance new file mode 100644 index 0000000000000000000000000000000000000000..4db7b254251307800861e0d06286bb15385cb54b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f4b28670-95d7-41ea-9769-e6ca08634dd2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1bd2f015d34b95061ffb52419d83d1a5aaa8e5f4271bf543525ea893fd93e9c +size 136044 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f617e947-2684-4461-a45f-773cbb55cc9b.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f617e947-2684-4461-a45f-773cbb55cc9b.lance new file mode 100644 index 
0000000000000000000000000000000000000000..221f4eacaca5bcaed293d453306be6540339c9d6 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f617e947-2684-4461-a45f-773cbb55cc9b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fd1e7945845fc305cc9d3bb2fbeba4a97d05c06c2c849a09d1e1d0afadcc0bf +size 140120 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f61e1f99-d789-4002-9bdc-bf22835652d7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f61e1f99-d789-4002-9bdc-bf22835652d7.lance new file mode 100644 index 0000000000000000000000000000000000000000..2844c6f1cef0f43e3f6969d5669f197764053c84 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f61e1f99-d789-4002-9bdc-bf22835652d7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3a39506cbc199083018920df8035c45211dfb354acc2bd6cbd1f6d91465af10 +size 141144 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f6c64faa-19e3-4cb1-88bb-d96ad6884c9c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f6c64faa-19e3-4cb1-88bb-d96ad6884c9c.lance new file mode 100644 index 0000000000000000000000000000000000000000..14cd780d81606c20aae567734b742e847e744e7e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f6c64faa-19e3-4cb1-88bb-d96ad6884c9c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed49793316c2d6905caecd499d775e432f426b4613dd45f265cdb05971f5bdca +size 147121 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f6da8301-378f-4a48-ab60-3c33c1e86bd0.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f6da8301-378f-4a48-ab60-3c33c1e86bd0.lance new file mode 100644 index 0000000000000000000000000000000000000000..092272b3b0b14614895ee1233379a42f3a21d284 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f6da8301-378f-4a48-ab60-3c33c1e86bd0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0462ce49ba21526c8c06c32e03ac6839f3353764ed126f9209544eab6982559 +size 139763 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f78d743e-67a8-478f-880c-30eda120f52f.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f78d743e-67a8-478f-880c-30eda120f52f.lance new file mode 100644 index 0000000000000000000000000000000000000000..cd34ab2b5e7dfeaaa5a0e95067987fa8fb2e8f5f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f78d743e-67a8-478f-880c-30eda120f52f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de3f5f6bcc526bc2c13443d0d1de1ae143dfec6d6c0a18193a03c94de84bdf9 +size 141877 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f9119d1b-bb50-41e6-81ac-f45ed62a1510.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f9119d1b-bb50-41e6-81ac-f45ed62a1510.lance new file mode 100644 index 0000000000000000000000000000000000000000..5934e8bf256f46245e49e38a961c799b85f705ec --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f9119d1b-bb50-41e6-81ac-f45ed62a1510.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7be0ee7c96f7928e81efc23e7f3918af7cfadec591b32d97f7e76277f5b13b34 +size 138179 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f99567cf-be07-457c-b16a-9de9fe710fdb.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f99567cf-be07-457c-b16a-9de9fe710fdb.lance new file mode 100644 index 0000000000000000000000000000000000000000..e3434a66365291dc68f1b66cee7169d7ff6ffe8d --- /dev/null +++ 
b/.lancedb/content_aware_chunking_BAAI.lance/data/f99567cf-be07-457c-b16a-9de9fe710fdb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ea82a58a814b7d7a38926555d785ec9f63fe097a6b9b6e54ac7ce0902178f8d +size 139679 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/f9dd6132-180f-4400-8a18-450a87dbe7e7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/f9dd6132-180f-4400-8a18-450a87dbe7e7.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3b6640ed60bd1f78a96e5a82ace05f2b25a9f3f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/f9dd6132-180f-4400-8a18-450a87dbe7e7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f2ea840353ea37f22a6f2b0bbad07c5479a89c2fdf8e398a81f01d00d5e1ab4 +size 145098 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fa17e307-cc71-4a25-858f-b08d3f136881.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fa17e307-cc71-4a25-858f-b08d3f136881.lance new file mode 100644 index 0000000000000000000000000000000000000000..1305ab9403cc7a67795388ec491a64b39e46bd02 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fa17e307-cc71-4a25-858f-b08d3f136881.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a69dc98c9e1217a888a0fa7b9c50d42e5e01cbe062aa7aa9feef6587dcbae83e +size 145342 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fa52877a-8a09-477c-b727-381c8429cde2.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fa52877a-8a09-477c-b727-381c8429cde2.lance new file mode 100644 index 0000000000000000000000000000000000000000..6acb07b36fa9826e0088e79cf4cb75eca8bab19e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fa52877a-8a09-477c-b727-381c8429cde2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b03a1d6665c3a75a6cdb5498a3a9535cb00d4e5d2757e5adc2ba1bbd0d292e +size 139568 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fa90ad69-52f6-4652-9be2-2d5bd6383a6c.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fa90ad69-52f6-4652-9be2-2d5bd6383a6c.lance new file mode 100644 index 0000000000000000000000000000000000000000..a1810bedaa7d51e8b06a13a401390cd068f1ba59 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fa90ad69-52f6-4652-9be2-2d5bd6383a6c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fd7f64c40d764d64bc8c5b93e16e79db9da9e1c46d2d6eb290d85252ff3a65e +size 146391 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fae550e4-9a00-479f-a757-1654cd14dfb4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fae550e4-9a00-479f-a757-1654cd14dfb4.lance new file mode 100644 index 0000000000000000000000000000000000000000..8578d340eab8083a9220ca07ad8e06cc22230860 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fae550e4-9a00-479f-a757-1654cd14dfb4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:416066425267287625ae2f319e85b2833522bacfcf1a8acda29a0bd9916b8ff2 +size 142221 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fc17b8d6-9ba7-4d73-bb6b-af42c8d0ee22.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fc17b8d6-9ba7-4d73-bb6b-af42c8d0ee22.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc77b6d1289a2f4117d3f44ffd48f17cd7b15035 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fc17b8d6-9ba7-4d73-bb6b-af42c8d0ee22.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d5768a2cfd36c2267c1bfa4d8873dc5358813ed61a395039466cf00bf63fc064 +size 148975 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fd7f2573-5353-4be4-a2b5-3280cbc504a7.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fd7f2573-5353-4be4-a2b5-3280cbc504a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..c65803c58307649440a81eb180086686c45df04e --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fd7f2573-5353-4be4-a2b5-3280cbc504a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ba973a90b86c524d62fd02fab00e056e430de7bb1253ba8a3bfe94848f0c0c5 +size 136495 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fde6b872-3192-4a29-a91c-6f50841663c4.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fde6b872-3192-4a29-a91c-6f50841663c4.lance new file mode 100644 index 0000000000000000000000000000000000000000..75b9fbaa7652cf74b5bf87c84407b749e2ed525f --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fde6b872-3192-4a29-a91c-6f50841663c4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f900783666994464cb1f8b004e3af8cdceef1bb930e77df67b45a9363f636b81 +size 138546 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/fdf905c0-f9b3-437c-91b5-a4b249616a69.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/fdf905c0-f9b3-437c-91b5-a4b249616a69.lance new file mode 100644 index 0000000000000000000000000000000000000000..37d61f1032b848398292990ffb68ee142326dca1 --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/fdf905c0-f9b3-437c-91b5-a4b249616a69.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee3366096d2d76af1a1155a80505688bb1bfd6b30bb7cfe6e9154ee4ff367438 +size 142291 diff --git a/.lancedb/content_aware_chunking_BAAI.lance/data/ffdddcf5-996e-4d39-9362-7edb58852072.lance b/.lancedb/content_aware_chunking_BAAI.lance/data/ffdddcf5-996e-4d39-9362-7edb58852072.lance new file mode 100644 index 0000000000000000000000000000000000000000..ea74f22bbf9ac713e94328a54a5ca391f49fc62b --- /dev/null +++ b/.lancedb/content_aware_chunking_BAAI.lance/data/ffdddcf5-996e-4d39-9362-7edb58852072.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7e9e2cd61b9ca25f797bd41182b711701f49ff386e81a03f722886e186fa80e +size 136228 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_indices/a54581e9-cd02-4204-95bd-4f840814eec5/index.idx b/.lancedb/fixed_size_chunking_BAAI.lance/_indices/a54581e9-cd02-4204-95bd-4f840814eec5/index.idx new file mode 100644 index 0000000000000000000000000000000000000000..3ec65884a4e1de5b25857198bc16355853a7a568 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/_indices/a54581e9-cd02-4204-95bd-4f840814eec5/index.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3018c0afc5e5a314718f86c8f01b952c1a01aa061f450736f8450a5de90b4088 +size 2213352 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_latest.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_latest.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4862ed17b2c51926b8dc168d8e94cfd4b675a4bd Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_latest.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/0-317783f6-28cd-4fb7-980c-c635e5288a47.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/0-317783f6-28cd-4fb7-980c-c635e5288a47.txn new file mode 100644 index 
0000000000000000000000000000000000000000..26be762d12a27bdcb9efc164bb154f4660e37622 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/0-317783f6-28cd-4fb7-980c-c635e5288a47.txn @@ -0,0 +1 @@ +$317783f6-28cd-4fb7-980c-c635e5288a47²V3vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:102408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/1-9c61b364-9b63-491f-bf73-78d5b90aa0f2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/1-9c61b364-9b63-491f-bf73-78d5b90aa0f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..6c29f1f8efdb880e3c5941bf33dd74c346281a12 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/1-9c61b364-9b63-491f-bf73-78d5b90aa0f2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/10-bbd12e40-867c-40ca-b85f-5c4d3f3b5ce4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/10-bbd12e40-867c-40ca-b85f-5c4d3f3b5ce4.txn new file mode 100644 index 0000000000000000000000000000000000000000..2ba5f849572049196d31b3e465bd72df29cbe828 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/10-bbd12e40-867c-40ca-b85f-5c4d3f3b5ce4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/100-432ba448-fd2a-418e-b58f-6684ed2bc4ee.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/100-432ba448-fd2a-418e-b58f-6684ed2bc4ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..57eaa3a6d5e348fcb73f54c51be15298429ab8cb Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/100-432ba448-fd2a-418e-b58f-6684ed2bc4ee.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/101-9fdeff98-b899-4498-9f26-56db9b020bd8.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/101-9fdeff98-b899-4498-9f26-56db9b020bd8.txn new file mode 100644 index 0000000000000000000000000000000000000000..a77a55f061d7ced152f488b94a1a64f9812419a6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/101-9fdeff98-b899-4498-9f26-56db9b020bd8.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/102-784c0d1e-a949-4c18-9134-30eb6954e4cd.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/102-784c0d1e-a949-4c18-9134-30eb6954e4cd.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb2e5407ea6e575665d7fdd21dac77e91c253e1e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/102-784c0d1e-a949-4c18-9134-30eb6954e4cd.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/103-3269e139-bd5c-41dc-85d1-80ebff21d651.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/103-3269e139-bd5c-41dc-85d1-80ebff21d651.txn new file mode 100644 index 0000000000000000000000000000000000000000..a3e04e13d7b5d44d966452649a31b7ae619d2aa0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/103-3269e139-bd5c-41dc-85d1-80ebff21d651.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/104-650f8643-001b-43cd-bd72-68478b581add.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/104-650f8643-001b-43cd-bd72-68478b581add.txn new file mode 100644 index 0000000000000000000000000000000000000000..f6cdb7277cbeaa96c157d28c39ff63725849e3e5 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/104-650f8643-001b-43cd-bd72-68478b581add.txn differ diff 
--git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/105-eba5af7d-770d-4005-8db3-d967fc90fec1.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/105-eba5af7d-770d-4005-8db3-d967fc90fec1.txn new file mode 100644 index 0000000000000000000000000000000000000000..aafbe9a3118fc85e307420be919f4289af73fc92 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/105-eba5af7d-770d-4005-8db3-d967fc90fec1.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/106-28bd897a-81e2-4040-aab8-0029577590cf.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/106-28bd897a-81e2-4040-aab8-0029577590cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..ed246184f748881a674c6ff168df09d5094b6281 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/106-28bd897a-81e2-4040-aab8-0029577590cf.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/107-8546e9eb-bc9a-46dc-9fb2-6c6bb80cb0e2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/107-8546e9eb-bc9a-46dc-9fb2-6c6bb80cb0e2.txn new file mode 100644 index 0000000000000000000000000000000000000000..fe8d2f6a79c4e3e6a1d3102f360d4f00728ce090 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/107-8546e9eb-bc9a-46dc-9fb2-6c6bb80cb0e2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/108-14f5d2f2-712f-4cd2-8a00-65cfc5d5bc8d.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/108-14f5d2f2-712f-4cd2-8a00-65cfc5d5bc8d.txn new file mode 100644 index 0000000000000000000000000000000000000000..5d7cd2c64016076070c821974f8a10fb1477e8ed Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/108-14f5d2f2-712f-4cd2-8a00-65cfc5d5bc8d.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/109-9e6e01e3-d302-4523-bee6-5ac651c9f46a.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/109-9e6e01e3-d302-4523-bee6-5ac651c9f46a.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce3c475dd579dbc765ae87edc4d0e6684b8cd458 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/109-9e6e01e3-d302-4523-bee6-5ac651c9f46a.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/11-27a88ab1-a9a5-4440-bb6d-fbd8935a7e94.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/11-27a88ab1-a9a5-4440-bb6d-fbd8935a7e94.txn new file mode 100644 index 0000000000000000000000000000000000000000..f59d3a31ff4ab3c0bb80e1975427f8a8350a3a4d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/11-27a88ab1-a9a5-4440-bb6d-fbd8935a7e94.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/110-1ff279af-995e-4505-9d23-bea214b9c5ab.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/110-1ff279af-995e-4505-9d23-bea214b9c5ab.txn new file mode 100644 index 0000000000000000000000000000000000000000..b34f1b9e8fa79943b67aa826c9ab13d7156e85fa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/110-1ff279af-995e-4505-9d23-bea214b9c5ab.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/111-7a410065-7558-4bd2-bb4b-fdc7af7ae2af.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/111-7a410065-7558-4bd2-bb4b-fdc7af7ae2af.txn new file mode 100644 index 0000000000000000000000000000000000000000..7fcd195151caab1ecd9c55b134ec90313671970a Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/111-7a410065-7558-4bd2-bb4b-fdc7af7ae2af.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/112-270e22f1-ee9d-4e1b-93b8-031efc123c5e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/112-270e22f1-ee9d-4e1b-93b8-031efc123c5e.txn new file mode 100644 index 0000000000000000000000000000000000000000..9921484a407e17cbbbc974487a1b7e15c8a1cdf0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/112-270e22f1-ee9d-4e1b-93b8-031efc123c5e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/113-089a3b65-560b-48f7-8f98-fd0ae0b01945.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/113-089a3b65-560b-48f7-8f98-fd0ae0b01945.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a10c1b605d3f4a6f53f4b2ff9bf0a8067d52f72 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/113-089a3b65-560b-48f7-8f98-fd0ae0b01945.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/114-21015bb2-2f46-466a-a5e9-f0c25ece03ad.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/114-21015bb2-2f46-466a-a5e9-f0c25ece03ad.txn new file mode 100644 index 0000000000000000000000000000000000000000..fbb60a7035cb7b784623d369c910bf230f1a597f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/114-21015bb2-2f46-466a-a5e9-f0c25ece03ad.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/115-9e1abdfc-c173-4c1e-8645-8f00a2bede3e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/115-9e1abdfc-c173-4c1e-8645-8f00a2bede3e.txn new file mode 100644 index 0000000000000000000000000000000000000000..412b999c9710f5aa32da5b665149e385f93be748 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/115-9e1abdfc-c173-4c1e-8645-8f00a2bede3e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/116-bebdf720-a011-44bc-970f-6443da836cfc.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/116-bebdf720-a011-44bc-970f-6443da836cfc.txn new file mode 100644 index 0000000000000000000000000000000000000000..ae4c5935808cc78ebfee5145f5ed20cf0c017b0e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/116-bebdf720-a011-44bc-970f-6443da836cfc.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/117-1cca55dc-f377-47ae-a91f-a9c19cd9797d.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/117-1cca55dc-f377-47ae-a91f-a9c19cd9797d.txn new file mode 100644 index 0000000000000000000000000000000000000000..2394f3df359e5ff2c57c661acd226a8332462cb1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/117-1cca55dc-f377-47ae-a91f-a9c19cd9797d.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/118-b7716017-2212-4c7d-b349-f10bf167bf56.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/118-b7716017-2212-4c7d-b349-f10bf167bf56.txn new file mode 100644 index 0000000000000000000000000000000000000000..678fe434270e84f73ea70b1dc1a91b77300abc52 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/118-b7716017-2212-4c7d-b349-f10bf167bf56.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/119-5d0ec588-7414-4770-9900-ce481bb001b3.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/119-5d0ec588-7414-4770-9900-ce481bb001b3.txn new file mode 100644 index 
0000000000000000000000000000000000000000..de8ee10be0c7be7cd8114a74d96ad62def74f57d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/119-5d0ec588-7414-4770-9900-ce481bb001b3.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/12-ac5441f1-4599-4086-942e-a4903c3fed7e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/12-ac5441f1-4599-4086-942e-a4903c3fed7e.txn new file mode 100644 index 0000000000000000000000000000000000000000..2e272fb2af8c1926967dabb729861ce8b1b6b4af Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/12-ac5441f1-4599-4086-942e-a4903c3fed7e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/120-49d384dc-0ca3-4fe2-ad8e-9ee254e104d2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/120-49d384dc-0ca3-4fe2-ad8e-9ee254e104d2.txn new file mode 100644 index 0000000000000000000000000000000000000000..96328e88e5c490d0a8f800c5c246a02766161ead Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/120-49d384dc-0ca3-4fe2-ad8e-9ee254e104d2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/121-332a818a-acc9-4dd8-8cd9-d3f983a96732.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/121-332a818a-acc9-4dd8-8cd9-d3f983a96732.txn new file mode 100644 index 0000000000000000000000000000000000000000..d1f051fd71f6c74655fe1f1faedbff721064a739 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/121-332a818a-acc9-4dd8-8cd9-d3f983a96732.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/122-5cde8dc3-7e6f-4452-b110-100b9c28f977.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/122-5cde8dc3-7e6f-4452-b110-100b9c28f977.txn new file mode 100644 index 0000000000000000000000000000000000000000..50fc6bdc98e696ffeda295daf4c6b90a36c525b0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/122-5cde8dc3-7e6f-4452-b110-100b9c28f977.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/123-97c3d6ed-1cf5-4e0c-bae3-4448bfb15915.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/123-97c3d6ed-1cf5-4e0c-bae3-4448bfb15915.txn new file mode 100644 index 0000000000000000000000000000000000000000..ac7465e83a8ce0fb271e6ea82acb7eda02f7ab67 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/123-97c3d6ed-1cf5-4e0c-bae3-4448bfb15915.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/124-3765d72b-0522-4868-9b77-7cd9154d6c58.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/124-3765d72b-0522-4868-9b77-7cd9154d6c58.txn new file mode 100644 index 0000000000000000000000000000000000000000..c26631f93bda7deb03a0f2e885b7e267f7cbdd46 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/124-3765d72b-0522-4868-9b77-7cd9154d6c58.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/125-e2102385-2a10-45a2-adca-e924a13101e4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/125-e2102385-2a10-45a2-adca-e924a13101e4.txn new file mode 100644 index 0000000000000000000000000000000000000000..c7b20cfafccf26dad8a806bb20ec41cdface031d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/125-e2102385-2a10-45a2-adca-e924a13101e4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/126-1a4cdce2-d83d-4dcb-968e-c34e7590faa1.txn 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/126-1a4cdce2-d83d-4dcb-968e-c34e7590faa1.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0d1b8d462d72188f99bbfe32a40daa17b9a126d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/126-1a4cdce2-d83d-4dcb-968e-c34e7590faa1.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/127-e98ad5d0-a496-4460-8d23-b98166032a34.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/127-e98ad5d0-a496-4460-8d23-b98166032a34.txn new file mode 100644 index 0000000000000000000000000000000000000000..8ad4af53643a4ad692bab0f4b1c56944c9bb8c5d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/127-e98ad5d0-a496-4460-8d23-b98166032a34.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/128-4b25226f-ca37-47f9-b734-6d94aa29ff0b.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/128-4b25226f-ca37-47f9-b734-6d94aa29ff0b.txn new file mode 100644 index 0000000000000000000000000000000000000000..4913f3eea3e60259badd1202ad0a6eeba29f230d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/128-4b25226f-ca37-47f9-b734-6d94aa29ff0b.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/129-52631c19-2556-41c2-948a-4082ae0cba64.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/129-52631c19-2556-41c2-948a-4082ae0cba64.txn new file mode 100644 index 0000000000000000000000000000000000000000..1c890cdfe924fbda3a1339a6b60bee668e58a69f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/129-52631c19-2556-41c2-948a-4082ae0cba64.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/13-40eedef8-2db5-40d9-84d5-1b0e98f2ce8c.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/13-40eedef8-2db5-40d9-84d5-1b0e98f2ce8c.txn new file mode 100644 index 0000000000000000000000000000000000000000..da1b691c4824d8cecbd8b20271771205e935abce Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/13-40eedef8-2db5-40d9-84d5-1b0e98f2ce8c.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/130-5dc65260-76c8-4779-a4e8-7111a52c791d.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/130-5dc65260-76c8-4779-a4e8-7111a52c791d.txn new file mode 100644 index 0000000000000000000000000000000000000000..d037a0dd34663b2336bd74323c7792f998e5f8e1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/130-5dc65260-76c8-4779-a4e8-7111a52c791d.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/131-7d5bba36-d89d-45de-a272-cbc2af403fe4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/131-7d5bba36-d89d-45de-a272-cbc2af403fe4.txn new file mode 100644 index 0000000000000000000000000000000000000000..c557e16dba0a60b275444288653e696d43cc03ef Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/131-7d5bba36-d89d-45de-a272-cbc2af403fe4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/132-bc51c989-1eac-4c5d-871a-85069060d003.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/132-bc51c989-1eac-4c5d-871a-85069060d003.txn new file mode 100644 index 0000000000000000000000000000000000000000..621e07fc156f17a5fe474f6f86ddc2c470b51851 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/132-bc51c989-1eac-4c5d-871a-85069060d003.txn differ diff 
--git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/133-beea9e5c-5437-464f-b2e2-4189ceade9b4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/133-beea9e5c-5437-464f-b2e2-4189ceade9b4.txn new file mode 100644 index 0000000000000000000000000000000000000000..79f5813cf506e2f1aca3e082dde4091ae3b851ec Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/133-beea9e5c-5437-464f-b2e2-4189ceade9b4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/134-efdaee90-3dc5-4d67-8f3b-5f9e12656106.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/134-efdaee90-3dc5-4d67-8f3b-5f9e12656106.txn new file mode 100644 index 0000000000000000000000000000000000000000..af8855e772956d91ca19845640fcbc9f00cbe591 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/134-efdaee90-3dc5-4d67-8f3b-5f9e12656106.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/135-fb7a3673-5473-4e2c-8812-3fef04d19902.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/135-fb7a3673-5473-4e2c-8812-3fef04d19902.txn new file mode 100644 index 0000000000000000000000000000000000000000..2c96e84963b0de6a8ff3192ff97210e570a60d5c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/135-fb7a3673-5473-4e2c-8812-3fef04d19902.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/136-d8eb6f27-8178-4dcb-aa35-6cd8f45e97ea.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/136-d8eb6f27-8178-4dcb-aa35-6cd8f45e97ea.txn new file mode 100644 index 0000000000000000000000000000000000000000..d227156370a48263a2a9a6d38f0ec69fa7b9f70c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/136-d8eb6f27-8178-4dcb-aa35-6cd8f45e97ea.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/137-8665323e-e53c-4fe2-90c7-9cdc2e8c130e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/137-8665323e-e53c-4fe2-90c7-9cdc2e8c130e.txn new file mode 100644 index 0000000000000000000000000000000000000000..fac96ab34ea817cdd486697a47e23b8df9433d51 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/137-8665323e-e53c-4fe2-90c7-9cdc2e8c130e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/138-675f97b0-28df-4e0d-980b-c09aa6f45c59.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/138-675f97b0-28df-4e0d-980b-c09aa6f45c59.txn new file mode 100644 index 0000000000000000000000000000000000000000..2d30b5c1557b5f788c9cbbcf2bbd75e340cc5642 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/138-675f97b0-28df-4e0d-980b-c09aa6f45c59.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/139-ca56d10c-7373-4b05-a46e-9ba0f90acce2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/139-ca56d10c-7373-4b05-a46e-9ba0f90acce2.txn new file mode 100644 index 0000000000000000000000000000000000000000..7ddb7458ad04bc99bef381568c1730da62b57f2d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/139-ca56d10c-7373-4b05-a46e-9ba0f90acce2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/14-d766be29-bd20-4cbe-8a1e-f26d999a5cd9.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/14-d766be29-bd20-4cbe-8a1e-f26d999a5cd9.txn new file mode 100644 index 0000000000000000000000000000000000000000..7b966798bad29a6d247c8e147097a1315ff5fff1 Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/14-d766be29-bd20-4cbe-8a1e-f26d999a5cd9.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/140-79df8b77-c4cf-431f-95f7-d7b22c9b2609.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/140-79df8b77-c4cf-431f-95f7-d7b22c9b2609.txn new file mode 100644 index 0000000000000000000000000000000000000000..a263edcc7d598f7c877b1122570e39b4fc1f4fc9 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/140-79df8b77-c4cf-431f-95f7-d7b22c9b2609.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/141-c95a0001-7bd4-41ee-b943-3c8155106c8f.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/141-c95a0001-7bd4-41ee-b943-3c8155106c8f.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f46b5fcb7159adfe98c2f1299c07cb97c0c1e45 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/141-c95a0001-7bd4-41ee-b943-3c8155106c8f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/142-c5da14f4-1fa3-48cb-9333-08a5a57c6625.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/142-c5da14f4-1fa3-48cb-9333-08a5a57c6625.txn new file mode 100644 index 0000000000000000000000000000000000000000..e03d84ce9c6400632fa0d41ad0ca1a5c70461537 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/142-c5da14f4-1fa3-48cb-9333-08a5a57c6625.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/143-d72ebd86-946c-49fa-b830-305126e08d70.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/143-d72ebd86-946c-49fa-b830-305126e08d70.txn new file mode 100644 index 0000000000000000000000000000000000000000..55f0d7b51c80eb6f1f723a1fdc0743c9344cdc76 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/143-d72ebd86-946c-49fa-b830-305126e08d70.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/144-6f887ea5-53b1-4396-a94e-654bf70e8a08.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/144-6f887ea5-53b1-4396-a94e-654bf70e8a08.txn new file mode 100644 index 0000000000000000000000000000000000000000..c51cfa77afc511e45dfa1397c7a10ef0ed423490 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/144-6f887ea5-53b1-4396-a94e-654bf70e8a08.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/145-80dd5e91-d15c-4df9-9bb7-8f4df36ae266.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/145-80dd5e91-d15c-4df9-9bb7-8f4df36ae266.txn new file mode 100644 index 0000000000000000000000000000000000000000..0c2d25dad95d58b07260b0fc373f4c046a374678 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/145-80dd5e91-d15c-4df9-9bb7-8f4df36ae266.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/146-7b75f9c3-3ef5-427d-bb63-e5bc9f22e974.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/146-7b75f9c3-3ef5-427d-bb63-e5bc9f22e974.txn new file mode 100644 index 0000000000000000000000000000000000000000..2dd738bedff1a8030aaa0f63916f0e291e91fd24 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/146-7b75f9c3-3ef5-427d-bb63-e5bc9f22e974.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/147-782695d0-50de-4215-a7c0-fe77a9380659.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/147-782695d0-50de-4215-a7c0-fe77a9380659.txn new file mode 100644 index 
0000000000000000000000000000000000000000..17f5d33dbf2cf48e05e8b15a09e474253913f6b9 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/147-782695d0-50de-4215-a7c0-fe77a9380659.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/148-d52cf42f-7bdf-445a-835e-2fa643559fea.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/148-d52cf42f-7bdf-445a-835e-2fa643559fea.txn new file mode 100644 index 0000000000000000000000000000000000000000..c9a07b1c6daf6e8000ff975d2c452e0ed3e44663 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/148-d52cf42f-7bdf-445a-835e-2fa643559fea.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/149-e007db67-cc7a-4ec8-85b4-62b7f7fd1e32.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/149-e007db67-cc7a-4ec8-85b4-62b7f7fd1e32.txn new file mode 100644 index 0000000000000000000000000000000000000000..a0af730c97e9a6c7c30f1e1d49757d5e0acedb59 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/149-e007db67-cc7a-4ec8-85b4-62b7f7fd1e32.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/15-eff9397a-ed5d-475b-a7b3-647840a7c3a9.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/15-eff9397a-ed5d-475b-a7b3-647840a7c3a9.txn new file mode 100644 index 0000000000000000000000000000000000000000..1b978ceac58b1659a3d81b80011a96e8f0d367ff Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/15-eff9397a-ed5d-475b-a7b3-647840a7c3a9.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/150-da44fe59-50d8-4078-8f6c-a4d943840666.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/150-da44fe59-50d8-4078-8f6c-a4d943840666.txn new file mode 100644 index 0000000000000000000000000000000000000000..84b345afdd6ad3805761d4edd4149c9c3518d22b Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/150-da44fe59-50d8-4078-8f6c-a4d943840666.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/151-c0c640ce-1bfb-402e-beea-0c73a08697d5.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/151-c0c640ce-1bfb-402e-beea-0c73a08697d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..4e1f07463f70249bb8381854cc3fc045a82d9c61 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/151-c0c640ce-1bfb-402e-beea-0c73a08697d5.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/16-889171a7-0e56-47bb-b7e6-55d794f06f57.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/16-889171a7-0e56-47bb-b7e6-55d794f06f57.txn new file mode 100644 index 0000000000000000000000000000000000000000..3f6e1410a4fd495a8cdbebe2a4e0f7ae2c17329f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/16-889171a7-0e56-47bb-b7e6-55d794f06f57.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/17-d1b81d05-414f-4af2-95c7-703f182da413.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/17-d1b81d05-414f-4af2-95c7-703f182da413.txn new file mode 100644 index 0000000000000000000000000000000000000000..6ff07bb06d450fa49320a0631c84752e152df0e7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/17-d1b81d05-414f-4af2-95c7-703f182da413.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/18-2f3c12da-eda6-4e4e-ad09-01b0419aebaa.txn 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/18-2f3c12da-eda6-4e4e-ad09-01b0419aebaa.txn new file mode 100644 index 0000000000000000000000000000000000000000..e863a820f78aef30522ede0de8f968298e1f2cba Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/18-2f3c12da-eda6-4e4e-ad09-01b0419aebaa.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/19-631fd2b8-1201-471d-a14c-e004071f54d4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/19-631fd2b8-1201-471d-a14c-e004071f54d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..dacab8c4400ab8344922ec0eda97a0ce05516326 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/19-631fd2b8-1201-471d-a14c-e004071f54d4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/2-ecaf822e-ee96-4f26-a114-815f106e6210.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/2-ecaf822e-ee96-4f26-a114-815f106e6210.txn new file mode 100644 index 0000000000000000000000000000000000000000..c1de2c394cefd14c2af236bc1ea54d0121a3d413 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/2-ecaf822e-ee96-4f26-a114-815f106e6210.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/20-3090330a-d600-4044-81cd-853faff4ddfd.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/20-3090330a-d600-4044-81cd-853faff4ddfd.txn new file mode 100644 index 0000000000000000000000000000000000000000..07c08d31e69f604cd7e4753e10784551368698cd Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/20-3090330a-d600-4044-81cd-853faff4ddfd.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/21-df591f21-6f4f-4f83-854d-8587ddd2d297.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/21-df591f21-6f4f-4f83-854d-8587ddd2d297.txn new file mode 100644 index 0000000000000000000000000000000000000000..32214e04c250c59440e19b3f685f4096e5fa3749 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/21-df591f21-6f4f-4f83-854d-8587ddd2d297.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/22-604dc13f-77b2-46cf-b4e2-dde90e454b13.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/22-604dc13f-77b2-46cf-b4e2-dde90e454b13.txn new file mode 100644 index 0000000000000000000000000000000000000000..42701b09e94859ae3ef998bc6f0929cacfe79afa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/22-604dc13f-77b2-46cf-b4e2-dde90e454b13.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/23-4e3541d4-8fe8-49a2-8baf-6f275a6f46d4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/23-4e3541d4-8fe8-49a2-8baf-6f275a6f46d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..aa7047039a8eba8dbcfc23342774df68b86f49af Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/23-4e3541d4-8fe8-49a2-8baf-6f275a6f46d4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/24-dc564e55-44af-42a7-87bb-86598f208623.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/24-dc564e55-44af-42a7-87bb-86598f208623.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f2b7bb7ff80f31b8b21756f495c1cfae3f445b4 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/24-dc564e55-44af-42a7-87bb-86598f208623.txn differ diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/25-57f5dcb9-a3bd-462c-b6fb-a3a88ef635a6.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/25-57f5dcb9-a3bd-462c-b6fb-a3a88ef635a6.txn new file mode 100644 index 0000000000000000000000000000000000000000..b312df2bc7b0697cf567b20d4fd1e2d1c2a9a74f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/25-57f5dcb9-a3bd-462c-b6fb-a3a88ef635a6.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/26-9e7401f2-a150-4bcc-bed4-79b3725a5e15.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/26-9e7401f2-a150-4bcc-bed4-79b3725a5e15.txn new file mode 100644 index 0000000000000000000000000000000000000000..de307fa0751f24792abc755e3366251f31c360b6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/26-9e7401f2-a150-4bcc-bed4-79b3725a5e15.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/27-d7ea145d-2170-4054-b717-866092e80449.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/27-d7ea145d-2170-4054-b717-866092e80449.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ae697089a61359e6476157fbff0b53c5413f0c0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/27-d7ea145d-2170-4054-b717-866092e80449.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/28-e7895365-bfd7-419f-9b5e-c89410e82ae3.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/28-e7895365-bfd7-419f-9b5e-c89410e82ae3.txn new file mode 100644 index 0000000000000000000000000000000000000000..f96e07dad173d0ede808b5f52b00b1d323344a75 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/28-e7895365-bfd7-419f-9b5e-c89410e82ae3.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/29-0f6238b5-41ba-4b0f-b139-cd3ba18ba9ce.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/29-0f6238b5-41ba-4b0f-b139-cd3ba18ba9ce.txn new file mode 100644 index 0000000000000000000000000000000000000000..37e45dec96cb9097b3a263f5671359f31d5ebdc7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/29-0f6238b5-41ba-4b0f-b139-cd3ba18ba9ce.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/3-d8c04433-3cd5-4723-bb86-4b70e31fb4e0.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/3-d8c04433-3cd5-4723-bb86-4b70e31fb4e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..e359a75e858abc8d1112fbd6e7c3537090940b8b Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/3-d8c04433-3cd5-4723-bb86-4b70e31fb4e0.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/30-f8a4978d-9215-4943-a9ca-007b9ddd9c13.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/30-f8a4978d-9215-4943-a9ca-007b9ddd9c13.txn new file mode 100644 index 0000000000000000000000000000000000000000..47adebea15a6736667cc53150ab6ce6f8e1139aa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/30-f8a4978d-9215-4943-a9ca-007b9ddd9c13.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/31-5c2d6fa8-b5c9-4f48-814b-d2b702a872f2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/31-5c2d6fa8-b5c9-4f48-814b-d2b702a872f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..f8e6c28073c211e7e0864016cd0bfe99ff3b4efb Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/31-5c2d6fa8-b5c9-4f48-814b-d2b702a872f2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/32-714f4288-1e1b-4c0a-b844-de2ddbd6577f.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/32-714f4288-1e1b-4c0a-b844-de2ddbd6577f.txn new file mode 100644 index 0000000000000000000000000000000000000000..68d2266c6566c4e39c5a4ee98542f184ebf37f30 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/32-714f4288-1e1b-4c0a-b844-de2ddbd6577f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/33-b544faca-f7ff-45db-bbac-21271d00d46e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/33-b544faca-f7ff-45db-bbac-21271d00d46e.txn new file mode 100644 index 0000000000000000000000000000000000000000..7ecd55e696aa60dc6a9feef3f8881d7962790d95 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/33-b544faca-f7ff-45db-bbac-21271d00d46e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/34-daa3441b-6652-4aee-83f2-bc3bb9d023fa.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/34-daa3441b-6652-4aee-83f2-bc3bb9d023fa.txn new file mode 100644 index 0000000000000000000000000000000000000000..d914b6eb934d7ea7b975faa02f62934f353e6d1c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/34-daa3441b-6652-4aee-83f2-bc3bb9d023fa.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/35-01111320-856f-4ae9-95ef-01be2222a4bf.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/35-01111320-856f-4ae9-95ef-01be2222a4bf.txn new file mode 100644 index 0000000000000000000000000000000000000000..71f779954c318dc580241aa83827137445728b6c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/35-01111320-856f-4ae9-95ef-01be2222a4bf.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/36-657f51dc-721b-4755-a433-d296386e1c9e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/36-657f51dc-721b-4755-a433-d296386e1c9e.txn new file mode 100644 index 0000000000000000000000000000000000000000..7cf8ce13df71395a08842deaf0546c01505a34ef Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/36-657f51dc-721b-4755-a433-d296386e1c9e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/37-4df2673d-b3a4-4ef1-b544-815932e782a4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/37-4df2673d-b3a4-4ef1-b544-815932e782a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..da06ee30bbb13e0c0abf2e31a832bc679fd7be7e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/37-4df2673d-b3a4-4ef1-b544-815932e782a4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/38-a7579f33-6642-4c05-8404-9175a82171d7.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/38-a7579f33-6642-4c05-8404-9175a82171d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..d00d6371d355303562bfc67efc46a6b9de333551 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/38-a7579f33-6642-4c05-8404-9175a82171d7.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/39-8024d6be-4aa7-4f7e-aa92-8f617914e71d.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/39-8024d6be-4aa7-4f7e-aa92-8f617914e71d.txn new file mode 100644 index 
0000000000000000000000000000000000000000..c387990973a180d901124d52b41e86e772dd9bfd Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/39-8024d6be-4aa7-4f7e-aa92-8f617914e71d.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/4-afc38d1b-8856-47ce-ba34-00ca66382415.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/4-afc38d1b-8856-47ce-ba34-00ca66382415.txn new file mode 100644 index 0000000000000000000000000000000000000000..89baa72cd05d54aab8eefec99605bb411d72b04d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/4-afc38d1b-8856-47ce-ba34-00ca66382415.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/40-002c5082-4916-4754-8cf7-6fc2f2d1dce6.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/40-002c5082-4916-4754-8cf7-6fc2f2d1dce6.txn new file mode 100644 index 0000000000000000000000000000000000000000..64e469ccddf877f765863fad0eced06268705805 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/40-002c5082-4916-4754-8cf7-6fc2f2d1dce6.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/41-71d9f7bd-b193-4339-971c-c577f630247b.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/41-71d9f7bd-b193-4339-971c-c577f630247b.txn new file mode 100644 index 0000000000000000000000000000000000000000..f08b09889784326dd487465a9a688f8dd5e9c179 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/41-71d9f7bd-b193-4339-971c-c577f630247b.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/42-3ef63428-4667-49bb-a8c5-8f7f8931b840.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/42-3ef63428-4667-49bb-a8c5-8f7f8931b840.txn new file mode 100644 index 0000000000000000000000000000000000000000..8595be47a221262c865f5500e4823fb7bbe41733 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/42-3ef63428-4667-49bb-a8c5-8f7f8931b840.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/43-93a61c1e-025b-4c2c-892f-3d43d1cc1a58.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/43-93a61c1e-025b-4c2c-892f-3d43d1cc1a58.txn new file mode 100644 index 0000000000000000000000000000000000000000..d7c763183c9e5e68f093db98b30739ed7bde1e70 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/43-93a61c1e-025b-4c2c-892f-3d43d1cc1a58.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/44-90882f7c-a460-408e-a411-721e1993f602.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/44-90882f7c-a460-408e-a411-721e1993f602.txn new file mode 100644 index 0000000000000000000000000000000000000000..b44e18b5155359f920876fa4d7c215736cee2493 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/44-90882f7c-a460-408e-a411-721e1993f602.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/45-1bd2bbe6-9cb9-4edd-8f41-f9a100e9f515.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/45-1bd2bbe6-9cb9-4edd-8f41-f9a100e9f515.txn new file mode 100644 index 0000000000000000000000000000000000000000..57ac9728770b2a36d0906a5c3a27fa3eb3571da1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/45-1bd2bbe6-9cb9-4edd-8f41-f9a100e9f515.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/46-ca410e62-ddd4-4c2a-b6ce-ac53a089ad1c.txn 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/46-ca410e62-ddd4-4c2a-b6ce-ac53a089ad1c.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd0a15dfb32a5ba8158541f82b7eacfc4481efc6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/46-ca410e62-ddd4-4c2a-b6ce-ac53a089ad1c.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/47-4f5e722b-8146-4135-92d4-f264276afea7.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/47-4f5e722b-8146-4135-92d4-f264276afea7.txn new file mode 100644 index 0000000000000000000000000000000000000000..7b2166b4cbd269f7e2f3239a6fc24be5dc00a276 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/47-4f5e722b-8146-4135-92d4-f264276afea7.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/48-07bf83e8-0ca9-4fb7-b701-2c28828ad543.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/48-07bf83e8-0ca9-4fb7-b701-2c28828ad543.txn new file mode 100644 index 0000000000000000000000000000000000000000..d5929270a5d3f1e0908d7ae903669a3176b98b24 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/48-07bf83e8-0ca9-4fb7-b701-2c28828ad543.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/49-3fee8b43-36c8-427c-a72f-40bf2ec82068.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/49-3fee8b43-36c8-427c-a72f-40bf2ec82068.txn new file mode 100644 index 0000000000000000000000000000000000000000..d860c0fa1b22905328a004ef9cc02a09255bf8cb Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/49-3fee8b43-36c8-427c-a72f-40bf2ec82068.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/5-57f29636-c98e-4e11-909f-640bfe5694dd.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/5-57f29636-c98e-4e11-909f-640bfe5694dd.txn new file mode 100644 index 0000000000000000000000000000000000000000..0db95bccd3eed59fe3d417d326ba9843fdd89ccc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/5-57f29636-c98e-4e11-909f-640bfe5694dd.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/50-d53e38f5-7419-40ac-a127-5da85f90edcd.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/50-d53e38f5-7419-40ac-a127-5da85f90edcd.txn new file mode 100644 index 0000000000000000000000000000000000000000..2bc8f724aaba29ed8ba9a5ab151902bc02ee2d2d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/50-d53e38f5-7419-40ac-a127-5da85f90edcd.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/51-b010e630-463f-4e74-8718-6864e19899b1.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/51-b010e630-463f-4e74-8718-6864e19899b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..526a3134c8ff3423bea041f681bd06386da9cc47 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/51-b010e630-463f-4e74-8718-6864e19899b1.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/52-debae20a-46fc-4f74-bc51-ca1bede74ffa.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/52-debae20a-46fc-4f74-bc51-ca1bede74ffa.txn new file mode 100644 index 0000000000000000000000000000000000000000..47e9e0cbc12684aedc77b351c23aec2720b7b824 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/52-debae20a-46fc-4f74-bc51-ca1bede74ffa.txn differ diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/53-070ac1b8-3966-44a9-964b-094817189e12.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/53-070ac1b8-3966-44a9-964b-094817189e12.txn new file mode 100644 index 0000000000000000000000000000000000000000..94ded4e0fb358ef3582f24694f22371cd71758dd Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/53-070ac1b8-3966-44a9-964b-094817189e12.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/54-34eb3da8-abfe-4c7b-b4ce-5bd9cce2cf53.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/54-34eb3da8-abfe-4c7b-b4ce-5bd9cce2cf53.txn new file mode 100644 index 0000000000000000000000000000000000000000..95f9b19a613f77a2e5afd857b2961a6a8a91c2ac Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/54-34eb3da8-abfe-4c7b-b4ce-5bd9cce2cf53.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/55-a006f23b-3a68-459d-ace3-ac871aa175d7.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/55-a006f23b-3a68-459d-ace3-ac871aa175d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..c78834a1cee26337c9798265931f6f0ac3e31a1a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/55-a006f23b-3a68-459d-ace3-ac871aa175d7.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/56-f65609a8-b644-41fe-990c-95c8c7dd1311.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/56-f65609a8-b644-41fe-990c-95c8c7dd1311.txn new file mode 100644 index 0000000000000000000000000000000000000000..d5543805e72a5bbfecc554e04469db60d01692b8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/56-f65609a8-b644-41fe-990c-95c8c7dd1311.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/57-7292c7d5-209f-4816-90db-f65dba23f1ad.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/57-7292c7d5-209f-4816-90db-f65dba23f1ad.txn new file mode 100644 index 0000000000000000000000000000000000000000..7dc8404c71a6c5975819b22c12b4c3200425494c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/57-7292c7d5-209f-4816-90db-f65dba23f1ad.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/58-39a5f5f0-9b40-4b56-9a9e-fad2fef3a629.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/58-39a5f5f0-9b40-4b56-9a9e-fad2fef3a629.txn new file mode 100644 index 0000000000000000000000000000000000000000..0214c90c43798f60aec089fe7f3c5db8aba6e5d0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/58-39a5f5f0-9b40-4b56-9a9e-fad2fef3a629.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/59-8e098de0-c1a6-4bbc-a60f-ea2ecfe5ce15.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/59-8e098de0-c1a6-4bbc-a60f-ea2ecfe5ce15.txn new file mode 100644 index 0000000000000000000000000000000000000000..da6f5784d341457997ff52018dda3fcc3e90c8e1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/59-8e098de0-c1a6-4bbc-a60f-ea2ecfe5ce15.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/6-38c4c380-8164-4c08-8c1f-55c5803eb8cf.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/6-38c4c380-8164-4c08-8c1f-55c5803eb8cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..37c2ed988d24b565de092b2b501282ab28733494 Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/6-38c4c380-8164-4c08-8c1f-55c5803eb8cf.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/60-00d51176-c65a-4f9b-9a63-5663f8b4bfee.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/60-00d51176-c65a-4f9b-9a63-5663f8b4bfee.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7fe8f83ed1f4d3c318d3ad6d54642658c43bbc3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/60-00d51176-c65a-4f9b-9a63-5663f8b4bfee.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/61-24580f7d-0aab-4d9f-9efc-69728902ac5e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/61-24580f7d-0aab-4d9f-9efc-69728902ac5e.txn new file mode 100644 index 0000000000000000000000000000000000000000..2613bbec754dca2e9d86fa368ab0303b1a5c8bc5 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/61-24580f7d-0aab-4d9f-9efc-69728902ac5e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/62-b9ef8e62-863b-4096-bffd-06ca5daf636f.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/62-b9ef8e62-863b-4096-bffd-06ca5daf636f.txn new file mode 100644 index 0000000000000000000000000000000000000000..c9ff999e8dde1804ce2583fde9a6c9cd14f7cc7a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/62-b9ef8e62-863b-4096-bffd-06ca5daf636f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/63-3af5707b-2f21-4620-9f5d-584f98044263.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/63-3af5707b-2f21-4620-9f5d-584f98044263.txn new file mode 100644 index 0000000000000000000000000000000000000000..63d5a0178cdb21ef0bc0d2688092817853644cad Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/63-3af5707b-2f21-4620-9f5d-584f98044263.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/64-67ef2fee-ef83-4a3f-a52b-0ab7a34d25b3.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/64-67ef2fee-ef83-4a3f-a52b-0ab7a34d25b3.txn new file mode 100644 index 0000000000000000000000000000000000000000..ddfaa928dd8913b2f2c43e325fcce497a298d3c8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/64-67ef2fee-ef83-4a3f-a52b-0ab7a34d25b3.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/65-0ce42f7b-a857-4d11-ba91-2a2b7de9b95a.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/65-0ce42f7b-a857-4d11-ba91-2a2b7de9b95a.txn new file mode 100644 index 0000000000000000000000000000000000000000..5914182f4497e2acc873dc2d5ae7e930fbe0e37e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/65-0ce42f7b-a857-4d11-ba91-2a2b7de9b95a.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/66-99002696-db7c-4e23-ab1c-95469bf84b88.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/66-99002696-db7c-4e23-ab1c-95469bf84b88.txn new file mode 100644 index 0000000000000000000000000000000000000000..687c015c59d3aaf3d434c5dbe262dcbb3e12aad8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/66-99002696-db7c-4e23-ab1c-95469bf84b88.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/67-f5721f32-709b-428b-b224-02978d53be8b.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/67-f5721f32-709b-428b-b224-02978d53be8b.txn new file mode 100644 index 
0000000000000000000000000000000000000000..613383a1197bac849390b82fc25808ee872b39b0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/67-f5721f32-709b-428b-b224-02978d53be8b.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/68-0384d006-3b09-4a08-aac2-1117fba83455.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/68-0384d006-3b09-4a08-aac2-1117fba83455.txn new file mode 100644 index 0000000000000000000000000000000000000000..dbc3ce5e968d910094c6f44a70e203997a08622f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/68-0384d006-3b09-4a08-aac2-1117fba83455.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/69-b7a39fcf-bdf5-4538-8d12-c09c44866305.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/69-b7a39fcf-bdf5-4538-8d12-c09c44866305.txn new file mode 100644 index 0000000000000000000000000000000000000000..a593e2f9b514f0f62b922caf7aec4f0f149993db Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/69-b7a39fcf-bdf5-4538-8d12-c09c44866305.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/7-4f66b767-5427-405a-980c-473eac75c6ca.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/7-4f66b767-5427-405a-980c-473eac75c6ca.txn new file mode 100644 index 0000000000000000000000000000000000000000..a42a3cedf6c003074e81c91e47e53b07ae86d154 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/7-4f66b767-5427-405a-980c-473eac75c6ca.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/70-4727a81a-9a7f-45f0-a692-8a4399b56f7a.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/70-4727a81a-9a7f-45f0-a692-8a4399b56f7a.txn new file mode 100644 index 0000000000000000000000000000000000000000..a1fb27907f3c9426fbefa7af12a4c551638cf67f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/70-4727a81a-9a7f-45f0-a692-8a4399b56f7a.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/71-64e96ae0-a110-489a-831c-6b47806fa437.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/71-64e96ae0-a110-489a-831c-6b47806fa437.txn new file mode 100644 index 0000000000000000000000000000000000000000..3a8c0c5eba59e50ffa28d043bf80368578f0e684 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/71-64e96ae0-a110-489a-831c-6b47806fa437.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/72-e7dc63d5-2e67-4522-8bc0-f984365e96b2.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/72-e7dc63d5-2e67-4522-8bc0-f984365e96b2.txn new file mode 100644 index 0000000000000000000000000000000000000000..2bdc4b7e83f50dd77f52861fb5bc3dfe240a29d3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/72-e7dc63d5-2e67-4522-8bc0-f984365e96b2.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/73-fc1bb466-68cc-4afd-8ba9-da330a83b242.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/73-fc1bb466-68cc-4afd-8ba9-da330a83b242.txn new file mode 100644 index 0000000000000000000000000000000000000000..bfa0c8ed413aac4b2bc4f4cf64d5c2e3a24843c0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/73-fc1bb466-68cc-4afd-8ba9-da330a83b242.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/74-a7bef011-a719-4210-9bad-a10a2eabc95f.txn 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/74-a7bef011-a719-4210-9bad-a10a2eabc95f.txn new file mode 100644 index 0000000000000000000000000000000000000000..793e7c4e783054cc2eca6013514f4176170f8af0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/74-a7bef011-a719-4210-9bad-a10a2eabc95f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/75-64d0d4b4-86c7-47d5-9500-f5a6a310df9e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/75-64d0d4b4-86c7-47d5-9500-f5a6a310df9e.txn new file mode 100644 index 0000000000000000000000000000000000000000..32b222c229c32d62d3bbbc4d2ff29203e43d68d4 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/75-64d0d4b4-86c7-47d5-9500-f5a6a310df9e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/76-ca7db129-5e93-4b5f-a8d9-02a07eabef94.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/76-ca7db129-5e93-4b5f-a8d9-02a07eabef94.txn new file mode 100644 index 0000000000000000000000000000000000000000..25dfa7de78c71006be2812c40c50a6b07b523eed Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/76-ca7db129-5e93-4b5f-a8d9-02a07eabef94.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/77-745cad0e-9236-4484-90d9-07c2c5e7a80c.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/77-745cad0e-9236-4484-90d9-07c2c5e7a80c.txn new file mode 100644 index 0000000000000000000000000000000000000000..2c90bc11934de85677d2cf3552a7a2556729a9a3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/77-745cad0e-9236-4484-90d9-07c2c5e7a80c.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/78-783c4945-a07f-4026-85f1-c08e4c71da96.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/78-783c4945-a07f-4026-85f1-c08e4c71da96.txn new file mode 100644 index 0000000000000000000000000000000000000000..a410ab00571480e290094deeac4c34ee82acb4ad Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/78-783c4945-a07f-4026-85f1-c08e4c71da96.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/79-0c23f0b2-6102-4037-a706-2f336dcbf27b.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/79-0c23f0b2-6102-4037-a706-2f336dcbf27b.txn new file mode 100644 index 0000000000000000000000000000000000000000..a71e667163a905117072f99c114e29c4361d0c7d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/79-0c23f0b2-6102-4037-a706-2f336dcbf27b.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/8-e3dca291-1e2d-4559-b52e-f9bfb4cdc2ac.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/8-e3dca291-1e2d-4559-b52e-f9bfb4cdc2ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..d67d2c7823819a26609d322440831196b129b44e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/8-e3dca291-1e2d-4559-b52e-f9bfb4cdc2ac.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/80-5d9bbe37-4c36-462a-93f0-b131761f9eb1.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/80-5d9bbe37-4c36-462a-93f0-b131761f9eb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e2f49a681baf08c4ca2d23acd1039e93cefab97 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/80-5d9bbe37-4c36-462a-93f0-b131761f9eb1.txn differ diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/81-55d41a56-7af5-4f8f-a8b3-51c049127d07.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/81-55d41a56-7af5-4f8f-a8b3-51c049127d07.txn new file mode 100644 index 0000000000000000000000000000000000000000..133e98c35fd4efd693d87b71ae3cfa1b2a8ae2de Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/81-55d41a56-7af5-4f8f-a8b3-51c049127d07.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/82-97705366-a61a-4c56-aeae-90f1e77e9e26.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/82-97705366-a61a-4c56-aeae-90f1e77e9e26.txn new file mode 100644 index 0000000000000000000000000000000000000000..4e79504c2fcba7cf5f5e10778819c84a43eb9154 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/82-97705366-a61a-4c56-aeae-90f1e77e9e26.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/83-83cc68d9-3653-4ec9-a347-8bfe8b04e480.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/83-83cc68d9-3653-4ec9-a347-8bfe8b04e480.txn new file mode 100644 index 0000000000000000000000000000000000000000..a340c01de8c96f062049224149432f4bef9c657f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/83-83cc68d9-3653-4ec9-a347-8bfe8b04e480.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/84-81ee3474-49f7-4e4c-b23c-b7d98589600e.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/84-81ee3474-49f7-4e4c-b23c-b7d98589600e.txn new file mode 100644 index 0000000000000000000000000000000000000000..ab873e52c431e2d4534857e58e10c44f8f106098 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/84-81ee3474-49f7-4e4c-b23c-b7d98589600e.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/85-840eba98-daed-4948-9072-7345af796f56.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/85-840eba98-daed-4948-9072-7345af796f56.txn new file mode 100644 index 0000000000000000000000000000000000000000..4b2515f32da9ac52659570418e6da6982f701f43 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/85-840eba98-daed-4948-9072-7345af796f56.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/86-a5be95d7-df4f-48e0-b3e6-291d1a9577c1.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/86-a5be95d7-df4f-48e0-b3e6-291d1a9577c1.txn new file mode 100644 index 0000000000000000000000000000000000000000..e380788ca6cadf4bd343f4deac0c6dfbce1d0d6c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/86-a5be95d7-df4f-48e0-b3e6-291d1a9577c1.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/87-04152bbb-6e75-40d3-ab91-d5f840e4abcf.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/87-04152bbb-6e75-40d3-ab91-d5f840e4abcf.txn new file mode 100644 index 0000000000000000000000000000000000000000..f30b2cfcb5d44fc4e9733f0eef637dac52fc067f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/87-04152bbb-6e75-40d3-ab91-d5f840e4abcf.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/88-64536f48-f507-48ba-bbb6-17b5344c454d.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/88-64536f48-f507-48ba-bbb6-17b5344c454d.txn new file mode 100644 index 0000000000000000000000000000000000000000..7d65bc2f13770ccd2a89efd8db9a932e92fbec35 Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/88-64536f48-f507-48ba-bbb6-17b5344c454d.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/89-bfa77872-1758-4695-bc8f-1714edc3270f.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/89-bfa77872-1758-4695-bc8f-1714edc3270f.txn new file mode 100644 index 0000000000000000000000000000000000000000..38446eb256cd295c0adf31dcdffda2c3b9a41f5c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/89-bfa77872-1758-4695-bc8f-1714edc3270f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/9-286ac7b0-cade-444b-8ac4-e3199fe8a04f.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/9-286ac7b0-cade-444b-8ac4-e3199fe8a04f.txn new file mode 100644 index 0000000000000000000000000000000000000000..a4d781e2701708d2b6d43e7dd2e2b1b5e45512f3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/9-286ac7b0-cade-444b-8ac4-e3199fe8a04f.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/90-ecb1effc-4018-4a31-8ae1-7904bc98afb4.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/90-ecb1effc-4018-4a31-8ae1-7904bc98afb4.txn new file mode 100644 index 0000000000000000000000000000000000000000..1136e71fb38dffc9b4ccb0d47bac9d289b89640c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/90-ecb1effc-4018-4a31-8ae1-7904bc98afb4.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/91-c321d88c-a393-4468-8ede-e9442f5a8057.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/91-c321d88c-a393-4468-8ede-e9442f5a8057.txn new file mode 100644 index 0000000000000000000000000000000000000000..a925fe74e01b74a2fad4a380470542d7c6ee5dba Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/91-c321d88c-a393-4468-8ede-e9442f5a8057.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/92-b3b8a73e-75d0-4f1e-a1ed-e96277f6acac.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/92-b3b8a73e-75d0-4f1e-a1ed-e96277f6acac.txn new file mode 100644 index 0000000000000000000000000000000000000000..9c5cbc5fa70d3c81a50f629c5408be20ba34772e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/92-b3b8a73e-75d0-4f1e-a1ed-e96277f6acac.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/93-fad64990-032d-4992-bd22-479425e99e08.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/93-fad64990-032d-4992-bd22-479425e99e08.txn new file mode 100644 index 0000000000000000000000000000000000000000..3ee63cf1dd282ef03d8789a73c6fa516b3f8927c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/93-fad64990-032d-4992-bd22-479425e99e08.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/94-17ade4c7-9693-4b28-aa06-f79773903a92.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/94-17ade4c7-9693-4b28-aa06-f79773903a92.txn new file mode 100644 index 0000000000000000000000000000000000000000..acd420256d30a830c4f8625b915295684eb083b5 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/94-17ade4c7-9693-4b28-aa06-f79773903a92.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/95-66b43851-aabd-49e3-8ee0-d42f468dfa25.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/95-66b43851-aabd-49e3-8ee0-d42f468dfa25.txn new file mode 100644 index 
0000000000000000000000000000000000000000..bc79c58de73b8ebfcf6fc01201ef2110fe171653 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/95-66b43851-aabd-49e3-8ee0-d42f468dfa25.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/96-075c59fc-aaef-4735-b132-e8d884616b4a.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/96-075c59fc-aaef-4735-b132-e8d884616b4a.txn new file mode 100644 index 0000000000000000000000000000000000000000..3d2e38e818feb1b694fd63e464ab76c8c6abd741 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/96-075c59fc-aaef-4735-b132-e8d884616b4a.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/97-b7a7a878-33b1-4bc4-bc94-098fe982d501.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/97-b7a7a878-33b1-4bc4-bc94-098fe982d501.txn new file mode 100644 index 0000000000000000000000000000000000000000..8e3f57d9a38c7166c7ef97129f6e08f1ec5a90b9 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/97-b7a7a878-33b1-4bc4-bc94-098fe982d501.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/98-47783a3c-af82-4387-a158-1ce7b2f7ecb1.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/98-47783a3c-af82-4387-a158-1ce7b2f7ecb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..b28efc4b74ad348b11616d7d0e1fff5fc56fd7b2 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/98-47783a3c-af82-4387-a158-1ce7b2f7ecb1.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/99-2d443b0e-1989-4d2a-8b52-df9d387e5d80.txn b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/99-2d443b0e-1989-4d2a-8b52-df9d387e5d80.txn new file mode 100644 index 0000000000000000000000000000000000000000..d7c065d7aac37c435f72ab42ea1d0896c2b72cc3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_transactions/99-2d443b0e-1989-4d2a-8b52-df9d387e5d80.txn differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/1.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/1.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f99285217a3b6cb2c97437be50e163521e9a287d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/1.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/10.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/10.manifest new file mode 100644 index 0000000000000000000000000000000000000000..44e4b815674daae00461ef1a1b892d3b3aab83ac Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/10.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/100.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/100.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c19f6f1028e9b060087f34f4b1aad1c611d12751 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/100.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/101.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/101.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ab6cfaa8b50357b1ac5e97b3a495ab6149ffeb1c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/101.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/102.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/102.manifest new file mode 100644 index 0000000000000000000000000000000000000000..af91ce46a3f117b4ceb2fbc7ef81be55e44e2653 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/102.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/103.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/103.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b922594ed2f14b093b64da588940eb37d4419906 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/103.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/104.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/104.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c7811cf94c6599f6b0955015d0389e8da1a41631 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/104.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/105.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/105.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9890ba75d4539111d232c3efad005e3ae3d92e25 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/105.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/106.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/106.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2762fdaf66919d5b37a5f1cfb1a4dbce6a4406aa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/106.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/107.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/107.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7bd8d2e4ae5dd97247d6338f14f5986b2782c329 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/107.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/108.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/108.manifest new file mode 100644 index 0000000000000000000000000000000000000000..096de8cf0fab061cb9847a24ffb6a1515aba2844 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/108.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/109.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/109.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2187843a6f37c0d6de67f98ecb23d1cb4c6e0093 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/109.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/11.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/11.manifest new file mode 100644 index 0000000000000000000000000000000000000000..16c6bd32bab0db7454cff15fa40506cc2da6323e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/11.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/110.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/110.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bcf6798115ff8cee6e557f3f64b20d66e04acb18 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/110.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/111.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/111.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9b5f44d8074b09931cc42fc0e25d0a07b41bb4de Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/111.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/112.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/112.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6797f9a8bd56f6056aaff154e63a3fe787d4f9de Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/112.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/113.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/113.manifest new file mode 100644 index 0000000000000000000000000000000000000000..83f434dfcc092a6469c191a26538a760cac35540 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/113.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/114.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/114.manifest new file mode 100644 index 0000000000000000000000000000000000000000..474a22cab00a149562de2d1fa2f84bd0b612271c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/114.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/115.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/115.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d96c83cab10b28ed3f0dd9878fe8344d48b95318 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/115.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/116.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/116.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da0d63c79864ecc93a287e0d50d6f49af902a41d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/116.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/117.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/117.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a3a24425c89e9bf6b046f45d34b548c52451e05a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/117.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/118.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/118.manifest new file mode 100644 index 0000000000000000000000000000000000000000..de2257753cda7d4259ddcc05fd21491f54cb92ba Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/118.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/119.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/119.manifest new file mode 100644 index 0000000000000000000000000000000000000000..93ff5c503e45e30e528829f2e5cbceafbc9b2b8d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/119.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/12.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/12.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9be85b6e358e8790054fcc351db4248903d8ab5f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/12.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/120.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/120.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e0d67b5d5e2617ced46ab8ee8e9b847df1dbb778 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/120.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/121.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/121.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d8ef47c4e798963df59e4aa133f47b1ec37400f5 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/121.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/122.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/122.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a19aa15b4ed4c38fcd2bab4330be1d303a20335 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/122.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/123.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/123.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ab47494abd9fe0d7788b130cb5053c4d1c82a51c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/123.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/124.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/124.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5afd1c1170c152599cf39f22688eb1b61730ccde Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/124.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/125.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/125.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7973b5becf0dbfb0f1dbb5c1370cf020c561fe5e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/125.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/126.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/126.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fdad407f395a8e9ce6e68caf69591af728bb81b8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/126.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/127.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/127.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ddeb763a8fcade45f44892fde59bae07a469e43 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/127.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/128.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/128.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fb7a3ed0c8bf9192a0fad57ad84e82eaaa2e23e7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/128.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/129.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/129.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0e8396f677b09f340e3bbd376815429bd6d97ee8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/129.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/13.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/13.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0b0cd93f30659713a6c1cf2620c646e2d8f8a1f8 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/13.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/130.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/130.manifest new file mode 100644 index 0000000000000000000000000000000000000000..77348eb0828fa8e26cc91d6c5dbcfaa1d8bef5ea Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/130.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/131.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/131.manifest new file mode 100644 index 0000000000000000000000000000000000000000..40d55617b158a3688d36745d42b7539a0d48c47a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/131.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/132.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/132.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3e6e94b32d19a69d1c49196457e361ff0e2caf59 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/132.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/133.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/133.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fe77617fadbcd62cfe07dba32681ab7ef28152da Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/133.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/134.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/134.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9883577caa9d148438af98fedeed6c09d42b8a43 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/134.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/135.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/135.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da51ec9756e2faaa53d58396530c8af63b0b31c4 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/135.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/136.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/136.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a7aa30787e3003d25d49551592438bf059230666 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/136.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/137.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/137.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3028e46d9b735e3d2b516e03fa8f342bd6ca33dc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/137.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/138.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/138.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e54f4c2f52d1e49d5eb1dc718f604d65dc6ea910 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/138.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/139.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/139.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a4010c67a55e640f356661072dfd30f753eb26b0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/139.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/14.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/14.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b83d5bd14af205f608a4051ca76f1aa95a546d19 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/14.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/140.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/140.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8b1122e9a6dbcc82eb6a6161976599a4d17360eb Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/140.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/141.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/141.manifest new file mode 100644 index 0000000000000000000000000000000000000000..16c5a77d4ce402360273f40e9b026d77bd2ad7bc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/141.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/142.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/142.manifest new file mode 100644 index 0000000000000000000000000000000000000000..123877bc9a124eed85b9683418ffd199a74720de Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/142.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/143.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/143.manifest new file mode 100644 index 0000000000000000000000000000000000000000..91331a63bc1681bbb588bb1f76eb64056a03dfe6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/143.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/144.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/144.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2f87cc7ba926e83baec0f0f53dc81f8a2bd63f04 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/144.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/145.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/145.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9290f217a08e965277a1d2daaa357359bcaeb376 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/145.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/146.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/146.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e85f80d0e5b0751c104ab47872ea95291db17ed9 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/146.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/147.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/147.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fdd8b4b74e80eab899bcf3ab855deefa6aba2293 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/147.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/148.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/148.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e9892629904bc30cb1542c5e7f8e8f81aacea01a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/148.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/149.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/149.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6fcf3c8bafe4c4982317c97262edb4b40b41abcc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/149.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/15.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/15.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e282e517da51c1e6b559119d1025532df9635f16 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/15.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/150.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/150.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4919f7feed9d4c3e151ce0ea7fd50bb812a4a1ac Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/150.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/151.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/151.manifest new file mode 100644 index 0000000000000000000000000000000000000000..82712ebcc3636bfda0e04aa2a30b34c8e7e028da Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/151.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/152.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/152.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4862ed17b2c51926b8dc168d8e94cfd4b675a4bd Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/152.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/16.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/16.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd0bdbb8c9500617e97044f5363e88ee2306bd0d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/16.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/17.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/17.manifest new file mode 100644 index 0000000000000000000000000000000000000000..20d973ae9fd8de31f4c08bfb4cc799ea7f9bd0ff Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/17.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/18.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/18.manifest new file mode 100644 index 0000000000000000000000000000000000000000..140e459d335c3b23bca6f04881f4f0560a9e3ee0 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/18.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/19.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/19.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02294f61fc3673f584e5d653ce1ee3bde6dfdcec Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/19.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/2.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/2.manifest new file 
mode 100644 index 0000000000000000000000000000000000000000..329c069c8d4cec6bcf3bf77502de14c31eaf4dbf Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/2.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/20.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/20.manifest new file mode 100644 index 0000000000000000000000000000000000000000..90c95cce2ed3b3d79cc938316b6df97ed58fa79f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/20.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/21.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/21.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cc653200b52c356fcd92b8c25391ad3ca5cb4097 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/21.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/22.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/22.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4ba64274e87f78b5426b85c607932ceeb4f33b3d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/22.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/23.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/23.manifest new file mode 100644 index 0000000000000000000000000000000000000000..45e4df7156783d9b93c46d33bbaeeedf88e00de6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/23.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/24.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/24.manifest new file mode 100644 index 0000000000000000000000000000000000000000..162de2198ca53448d6bfc0bed2738032200002b6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/24.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/25.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/25.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ad9ebe81fd89491dfc4c332bb866d4b56f81bfd7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/25.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/26.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/26.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0d78f8fbff9d2f54e4eb8f400651fb5013eb2899 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/26.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/27.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/27.manifest new file mode 100644 index 0000000000000000000000000000000000000000..34917bc2afc0e795c2e7ba8c52e27870943d5f0c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/27.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/28.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/28.manifest new file mode 100644 index 0000000000000000000000000000000000000000..980cafb2ca3e71945a5c98764f3a69cba3b2d0f2 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/28.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/29.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/29.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..79a984b54ef73d92aff6d8576e056199b404356e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/29.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/3.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/3.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9b68aa989fa02ada0c592c0ffba57c833379355c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/3.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/30.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/30.manifest new file mode 100644 index 0000000000000000000000000000000000000000..66383abfefc688bc3a2d5938b6581e6b43d0bd30 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/30.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/31.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/31.manifest new file mode 100644 index 0000000000000000000000000000000000000000..702593e3fd182fa55c1cbd07f244ec9de6c46d39 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/31.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/32.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/32.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4e896ed0d0a7093232bc0f871797ab4fe57015e6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/32.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/33.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/33.manifest new file mode 100644 index 0000000000000000000000000000000000000000..776b98d7f2358bb372edef2bc9381de5a47a6dab Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/33.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/34.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/34.manifest new file mode 100644 index 0000000000000000000000000000000000000000..87a64873927a5ef0392aae89e195e5c448bf7b20 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/34.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/35.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/35.manifest new file mode 100644 index 0000000000000000000000000000000000000000..63ee471fc5d96386a84bad93839c881582de3fec Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/35.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/36.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/36.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7934a28f248a85146720ed04780f7123a3cf76ee Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/36.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/37.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/37.manifest new file mode 100644 index 0000000000000000000000000000000000000000..83763aebbddfc39f95043a4e4db17598477380b4 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/37.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/38.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/38.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d01709f8ef039f30602d58745f10b3694addc41a 
Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/38.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/39.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/39.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7304ba23984cc1f81b297b52030e82c310ccf58f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/39.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/4.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/4.manifest new file mode 100644 index 0000000000000000000000000000000000000000..55310fe7585c58323ea1af6d02a56be8bbc6ac1e Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/4.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/40.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/40.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d5b7394605f95e6d7ae1c88db6b6d5b9682f5cf3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/40.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/41.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/41.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0bae38d812db1193e7bf5de79614f849aa9c6ed7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/41.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/42.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/42.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c38886568c2dd3bb927775c9bac9fbe3dc81cb3d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/42.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/43.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/43.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ba552b70bf1602f5f13e5efbc53a75706a80a3aa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/43.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/44.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/44.manifest new file mode 100644 index 0000000000000000000000000000000000000000..64f9146a9b8c8f5921664813559a8e8e094c2909 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/44.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/45.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/45.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4ae068e1cd80b25ce0c14bacaa1402b0b7e57826 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/45.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/46.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/46.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dff669964d97e7ff74e9ea1efd77340ed9875b9b Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/46.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/47.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/47.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a1e9e801eb29ed68ad754f2c5958f053631e0b46 Binary files /dev/null and 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/47.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/48.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/48.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c2c2c09b9d37477056466426be919d20eda66ffa Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/48.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/49.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/49.manifest new file mode 100644 index 0000000000000000000000000000000000000000..36bda54b655e7bf74ebcfa7a7302ce5909dd2915 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/49.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/5.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/5.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1b34cb06a56101189cc2e9dd2979d1558058d29c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/5.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/50.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/50.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e216af38c44b70af5c95adbd2effe2ce52c8875 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/50.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/51.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/51.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4b9e65d63c69c01872c6669b59536d608deedf24 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/51.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/52.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/52.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5763e6b904cf1cf763e1e950042f3618abf27230 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/52.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/53.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/53.manifest new file mode 100644 index 0000000000000000000000000000000000000000..15640ea9ad14383d6ad5353e6dd7cbb6c9c912ed Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/53.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/54.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/54.manifest new file mode 100644 index 0000000000000000000000000000000000000000..89041052458032a255417e700460c7b66772e4e9 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/54.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/55.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/55.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c2ebf08030e3e87bea2bdc56c2290be16926e230 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/55.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/56.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/56.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ccf5eeb3a66dfe5b7c8a69336b11663b066ffa49 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/56.manifest differ diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/57.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/57.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e3453d6a8fbf86dd81e0d84eb5d4fcba3948d4d1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/57.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/58.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/58.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd10d093dd946a5ec6b77e1b37d55748d7769466 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/58.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/59.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/59.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1985b145e722bf5c8b8c37b3fabed64a639f48c4 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/59.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/6.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/6.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f8afe334fd389a44e0e6bf22302e8f2e1792100b Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/6.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/60.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/60.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1daf83463b3c190361ee5ec22cae0fd78bc5be2a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/60.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/61.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/61.manifest new file mode 100644 index 0000000000000000000000000000000000000000..75cae54480e88c44b2b2920ec3a3b25062e91a93 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/61.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/62.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/62.manifest new file mode 100644 index 0000000000000000000000000000000000000000..78dc2c6ac7d7472afdbe59b974d9f761047b821f Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/62.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/63.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/63.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f35cfc966b98de193b6ca8ff390c73049853abd7 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/63.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/64.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/64.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cab45983041a7d2f6b62c333894f6fa0cf289826 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/64.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/65.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/65.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f12d22e9281c52097c498d46207847e48e516c41 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/65.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/66.manifest 
b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/66.manifest new file mode 100644 index 0000000000000000000000000000000000000000..933e1af7538a3136d5e5ccd686fa2c7edb6ce228 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/66.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/67.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/67.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cb7d777d6cd411a3b74420e50cfca28476532813 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/67.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/68.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/68.manifest new file mode 100644 index 0000000000000000000000000000000000000000..213f488c6094f12c32f7cec024bcad17f0d82408 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/68.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/69.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/69.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0c32c29bc39343daeebff5b8d340ed3afb2cda8d Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/69.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/7.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/7.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0e298c534677aef384a0210c8888979b61ed1f0c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/7.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/70.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/70.manifest new file mode 100644 index 0000000000000000000000000000000000000000..54c1dfbe93c9b03cd47183b67b9a7282253b9178 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/70.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/71.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/71.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f33d111342c56229fe0adba61520a0e0e6575084 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/71.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/72.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/72.manifest new file mode 100644 index 0000000000000000000000000000000000000000..08958a0ba523a7c0c29bae5b13cc998d04b1d34c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/72.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/73.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/73.manifest new file mode 100644 index 0000000000000000000000000000000000000000..409ce8273a24e38803c8490efc08178fd68d0c70 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/73.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/74.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/74.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4e95092caf7a27fe4678dbb4217d0f949e022ca3 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/74.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/75.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/75.manifest new file mode 100644 
index 0000000000000000000000000000000000000000..259fa9e9a9095214dc38850f92339b2d45818381 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/75.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/76.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/76.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d077b8c9cd26b9d61d442d27678937a3bf81a120 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/76.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/77.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/77.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1eb5c89031a2592af1f43c13e4d83692351422c1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/77.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/78.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/78.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b5538a224ee49d21a8104556ca704d586701a006 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/78.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/79.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/79.manifest new file mode 100644 index 0000000000000000000000000000000000000000..302f5435b0854e2e974c48ecf62b9e6518d9d28c Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/79.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/8.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/8.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d7f6e099f7799248e05663f7fa30ba2e0864db1 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/8.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/80.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/80.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b293ccf12a4b5a1d836afbaf3ad89908a2d09eef Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/80.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/81.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/81.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e0750485969ef2b069ddc4e0b49a79b616302c5b Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/81.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/82.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/82.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4fbae47acc33bdadea757eebe035a8d4106b8316 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/82.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/83.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/83.manifest new file mode 100644 index 0000000000000000000000000000000000000000..981209107190e62cc4886e8abddd50ef1676a0e6 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/83.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/84.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/84.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4368cc8c21815ae1f384ed680d8613dea1c56fed Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/84.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/85.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/85.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60b53778ca8e91a1ebe974d51f1866cc2b275f95 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/85.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/86.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/86.manifest new file mode 100644 index 0000000000000000000000000000000000000000..481fb325d8d394f6b9a3ec61b3e8d729a2d1fdcc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/86.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/87.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/87.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2cdba648b759a8306458a4b9a23c5af39c9853fc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/87.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/88.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/88.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9583895567a40ae1c04805ee8c5032745c5b4ef5 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/88.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/89.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/89.manifest new file mode 100644 index 0000000000000000000000000000000000000000..721a9e3be6dd460a7da101aa9d217e7e95255094 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/89.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/9.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/9.manifest new file mode 100644 index 0000000000000000000000000000000000000000..acb4fa3fcbb0a6e1986da436d2e1268901ea7357 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/9.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/90.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/90.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2f61ae0cacf845931b2b7362082cbd6ad2e8e3dc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/90.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/91.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/91.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ec852e47feb1fa50f7dca3e1486afa568905ff18 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/91.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/92.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/92.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3451a845353b254d46174aa05b11523370327d36 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/92.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/93.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/93.manifest new file mode 100644 index 0000000000000000000000000000000000000000..851590df14962f82e74d7d0709631ad5292a052a 
Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/93.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/94.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/94.manifest new file mode 100644 index 0000000000000000000000000000000000000000..97c41941f45b36ae67d2a6a3227cfc30c3a410c2 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/94.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/95.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/95.manifest new file mode 100644 index 0000000000000000000000000000000000000000..28f0e8f2ea7255f20a1860c808b8cf949975f09a Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/95.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/96.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/96.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b249d7d6478fd8109d656935c2aee966ed172fb Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/96.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/97.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/97.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2ce2a24a4773e36f43dd07ccd2820a0d1725cbde Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/97.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/98.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/98.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd12c7260b34204e76f0eeb17ae1237a5b460f02 Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/98.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/_versions/99.manifest b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/99.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ce4ef1ad95d8647bb1bfa90f7fde7d435ecdd0fc Binary files /dev/null and b/.lancedb/fixed_size_chunking_BAAI.lance/_versions/99.manifest differ diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/043459a3-09d4-42e8-b6e5-aa46667f8aa4.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/043459a3-09d4-42e8-b6e5-aa46667f8aa4.lance new file mode 100644 index 0000000000000000000000000000000000000000..840b2d5dda9c224506cbe9f29c47a0cf67027243 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/043459a3-09d4-42e8-b6e5-aa46667f8aa4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24ee60096d855ca34e139e79e2a2e43206e6eac5c5a277b708ab7b9ac25a48f8 +size 147523 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/069761cc-e2d9-46ca-923a-f77a70b75831.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/069761cc-e2d9-46ca-923a-f77a70b75831.lance new file mode 100644 index 0000000000000000000000000000000000000000..9cf170a251f162e0430dae08dfeaad2a419aff4c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/069761cc-e2d9-46ca-923a-f77a70b75831.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1da046d32d4e1ea9d6af74522c21d010a8f32998c0d019ebf7244919d3539a64 +size 147747 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/06e05227-0198-4f12-9fb9-aaa47121f366.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/06e05227-0198-4f12-9fb9-aaa47121f366.lance new file mode 100644 index 
0000000000000000000000000000000000000000..707c260e710fa362dd28363a4b0c0ef82d114435 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/06e05227-0198-4f12-9fb9-aaa47121f366.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:909c270b37490feab9bfd53315a90fd5bded72d614d3886b63edb5cdb697dc90 +size 148033 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/07c81b0a-946a-4a5d-b133-d3207c6b3fbe.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/07c81b0a-946a-4a5d-b133-d3207c6b3fbe.lance new file mode 100644 index 0000000000000000000000000000000000000000..1d6769a04a0a01f9d448f7b65613487bb4979408 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/07c81b0a-946a-4a5d-b133-d3207c6b3fbe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24cc8c836fbcdd7ab91b41c5946107bcf7d413bc970869b7cf2e6e46f9f3ce0d +size 147347 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/07d5ef30-2842-4608-aa3f-4919ce41f03c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/07d5ef30-2842-4608-aa3f-4919ce41f03c.lance new file mode 100644 index 0000000000000000000000000000000000000000..07225e0df85cfd7fd79a95c3fd0329c48c9566e2 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/07d5ef30-2842-4608-aa3f-4919ce41f03c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbd84360359f98a64e6ab09a05c575ec170c292b33bafa2331ea1758996e5d17 +size 147055 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/09829cf7-24fa-432d-9975-52c25b41d657.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/09829cf7-24fa-432d-9975-52c25b41d657.lance new file mode 100644 index 0000000000000000000000000000000000000000..35366d95de253878a65e876fdccc7f76483f3697 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/09829cf7-24fa-432d-9975-52c25b41d657.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d658d295f6c5a78f5350d0cccbb257243f1aff31c87a1847788da033768fd92 +size 146512 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0baea6fa-115d-47a2-9d27-b029349cfdcd.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0baea6fa-115d-47a2-9d27-b029349cfdcd.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b4176db998bf939e3010b989ff226ce60b068bc --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0baea6fa-115d-47a2-9d27-b029349cfdcd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f7d995d42f7b96852939b38afa73cd00acbfa0f639ea029f62b7ca702fa6e0 +size 148160 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0c12b9c3-a41c-431c-8d7b-60f52e5fd43c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0c12b9c3-a41c-431c-8d7b-60f52e5fd43c.lance new file mode 100644 index 0000000000000000000000000000000000000000..3c7cfee3df72dd95c0d329313a194955ed1cb93b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0c12b9c3-a41c-431c-8d7b-60f52e5fd43c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1716c478e175e8400c73b180368d6194e37569cdce24289e32e1add18087d7ec +size 147483 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0ce0f390-f413-4ed8-8ccf-5fbbad989e24.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0ce0f390-f413-4ed8-8ccf-5fbbad989e24.lance new file mode 100644 index 0000000000000000000000000000000000000000..06e865f8c87a1b5880774f4fa2303f51645c7a04 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0ce0f390-f413-4ed8-8ccf-5fbbad989e24.lance @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad9b092939ca8aa1540061a671ffc4051233295a1d99c93e0fd05bd174142799 +size 147841 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0de321f9-b1ea-40b4-ba17-d0832dd69dd0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0de321f9-b1ea-40b4-ba17-d0832dd69dd0.lance new file mode 100644 index 0000000000000000000000000000000000000000..7b0a915393e6ffe2bb8adb56d93a748b5e3e437a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0de321f9-b1ea-40b4-ba17-d0832dd69dd0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16fa50720fb523ba1303c3789ccb2203deb1240cc156ce9d2815a541201286d0 +size 147758 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0f258006-8f34-4313-ac7f-925134d87e0a.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0f258006-8f34-4313-ac7f-925134d87e0a.lance new file mode 100644 index 0000000000000000000000000000000000000000..77cd1ee9e93285cb03ea644af650bc0bce349539 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0f258006-8f34-4313-ac7f-925134d87e0a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c60fe519d2a379bea7574e4e4cb1849c6064acb9d234090a1dd7a599dde8cc1 +size 148105 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0f71b73c-c5cf-4af8-a84b-90247c66664d.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0f71b73c-c5cf-4af8-a84b-90247c66664d.lance new file mode 100644 index 0000000000000000000000000000000000000000..9ded3c9dcccb985c8cd61d7cf2bda71d8892721a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0f71b73c-c5cf-4af8-a84b-90247c66664d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:162c812d6150356582da5045ea08cd85988c1c69df3ee69198f485e131af60df +size 146981 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/0ffe5cff-2730-4290-be85-1726f7a2c050.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/0ffe5cff-2730-4290-be85-1726f7a2c050.lance new file mode 100644 index 0000000000000000000000000000000000000000..e30d8308460aaa960e1e74931aa53eda9477cd17 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/0ffe5cff-2730-4290-be85-1726f7a2c050.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d4391013b1ffaf6e29ba74e104579593aa01dd71857aae817fa780c7c992fb +size 147821 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1236ed28-9d45-4806-af52-5af285169f5e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1236ed28-9d45-4806-af52-5af285169f5e.lance new file mode 100644 index 0000000000000000000000000000000000000000..2d6d5d35cea8bb48808d9d3a0c186514d33b6bdc --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1236ed28-9d45-4806-af52-5af285169f5e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d3a4781dcc0ada79249281e654bbd23783f41ccf98113fe9c230ad3fedc1f6 +size 148246 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/142b1858-17ff-4d04-9f6d-147853d43fd7.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/142b1858-17ff-4d04-9f6d-147853d43fd7.lance new file mode 100644 index 0000000000000000000000000000000000000000..4cc76f519a225048aa2851a07312ed4a608b3f87 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/142b1858-17ff-4d04-9f6d-147853d43fd7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5943ba43dccb6bfd932891098a806db404db2764cc25d6485861b4831ac6efae +size 147568 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/15766ff8-6632-4968-bafc-4e7804f23bd0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/15766ff8-6632-4968-bafc-4e7804f23bd0.lance new file mode 100644 index 0000000000000000000000000000000000000000..67bdc7319dbe411ecdd479ff51eefd50ac4b5d7e --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/15766ff8-6632-4968-bafc-4e7804f23bd0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8dd0efab4da3177f95c220df3fee8270b50a31d2422cddfb0851d02af87469 +size 147609 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1776958b-926f-4fda-98dc-2869ee7ec8e6.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1776958b-926f-4fda-98dc-2869ee7ec8e6.lance new file mode 100644 index 0000000000000000000000000000000000000000..5ed1dacd7f1be46e4cd1456ab17a47b544395c16 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1776958b-926f-4fda-98dc-2869ee7ec8e6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfdcf928f8cde942d157526553ba89b9f937d4b11db298dc1b1b4f8b221db6e7 +size 148157 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1792f8d4-d157-4fdb-861b-e5a84b604196.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1792f8d4-d157-4fdb-861b-e5a84b604196.lance new file mode 100644 index 0000000000000000000000000000000000000000..bdc2ded864a29639245ed65f813c05629504bdbf --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1792f8d4-d157-4fdb-861b-e5a84b604196.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf464923c6b246f5f32468bcae5557acb5288bec87ea10862813b0337c0bb512 +size 148184 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1919268d-de71-4145-a3ca-8aa8c738cb81.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1919268d-de71-4145-a3ca-8aa8c738cb81.lance new file mode 100644 index 0000000000000000000000000000000000000000..18f3912ddaf71c5ba330a92070b3c3758ef09019 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1919268d-de71-4145-a3ca-8aa8c738cb81.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5de37c93344a11467a98dd8775824e96329d9428ddf9d12cc2d7851e7a5f3c3 +size 147868 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1a3bc194-d1c3-448b-b2d9-b771421f184f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1a3bc194-d1c3-448b-b2d9-b771421f184f.lance new file mode 100644 index 0000000000000000000000000000000000000000..b0206834adc6cd9e00b938ab4222e46798998e21 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1a3bc194-d1c3-448b-b2d9-b771421f184f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29e4b966b31f6f80d5ce77849a50f2c382f94b2cba2b332179fd9ba204f73da4 +size 147190 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/1ef8539a-6980-477f-aa18-9834d002ee08.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/1ef8539a-6980-477f-aa18-9834d002ee08.lance new file mode 100644 index 0000000000000000000000000000000000000000..063a193cef905e35cf42b08379d8e5c2ae5b28b8 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/1ef8539a-6980-477f-aa18-9834d002ee08.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fef9926a2a5f69d7b69c8d1967c61cd0c80e78a20665e7bb8180568a64394df +size 147418 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/214c42e3-e4c3-4e66-881d-76b09d488382.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/214c42e3-e4c3-4e66-881d-76b09d488382.lance new file mode 100644 
index 0000000000000000000000000000000000000000..ec10e990d8a14572abecf068b7aa73355142a72f --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/214c42e3-e4c3-4e66-881d-76b09d488382.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f891ce01a4d249c0cca0576c40444d1ee2679e6c231549a251db6b6ef40752f2 +size 146895 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/22aa13fc-e8fd-4738-a967-7e47ec8d4c31.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/22aa13fc-e8fd-4738-a967-7e47ec8d4c31.lance new file mode 100644 index 0000000000000000000000000000000000000000..38397cd713a86a45b7c3b2795e1e2411528ba111 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/22aa13fc-e8fd-4738-a967-7e47ec8d4c31.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c982b221c8c47d7109035a87a5edf29c2976426b72f4fbec60e209215ea6a948 +size 148143 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/23044aa4-eeac-4b25-ac83-5177c1184fc0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/23044aa4-eeac-4b25-ac83-5177c1184fc0.lance new file mode 100644 index 0000000000000000000000000000000000000000..0dc5a4942cdcdef1af91731c2a012a6ed4c8f100 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/23044aa4-eeac-4b25-ac83-5177c1184fc0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eacda42f753a85836e4ae43c5a236e6f6d49b7250cb849d61fa299f6af81562 +size 147918 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/24c2b23b-d701-4d9f-9080-9cc20a4d430c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/24c2b23b-d701-4d9f-9080-9cc20a4d430c.lance new file mode 100644 index 0000000000000000000000000000000000000000..87cef0f2d05b36ddb7e071e4ed13a0e76d6f2264 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/24c2b23b-d701-4d9f-9080-9cc20a4d430c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3210a4087a60e979ec0daafa9c57e568f9c27a0df238e3bdcff88e3e9dcf5a90 +size 148274 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/252cca4b-75ac-420a-9ec3-a3e24dbd53f6.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/252cca4b-75ac-420a-9ec3-a3e24dbd53f6.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b76d2d48c401052320413f07a347a60614be78b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/252cca4b-75ac-420a-9ec3-a3e24dbd53f6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56d021f284e352ff6e017bcee440724e359f713a1ed9e741278f0589b0f714ad +size 147142 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/29f04019-40db-4448-b11f-2eee41e3b620.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/29f04019-40db-4448-b11f-2eee41e3b620.lance new file mode 100644 index 0000000000000000000000000000000000000000..d70c2f7becb8988baea14a06b59f72d2716e07b5 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/29f04019-40db-4448-b11f-2eee41e3b620.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024c4410452e4ba821d1ebb0aa29b0ca59466ce34a4dc2f2b0b8829818b93cf8 +size 147471 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/2d3c5476-7a9d-495b-a3af-6a33ec44e36c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/2d3c5476-7a9d-495b-a3af-6a33ec44e36c.lance new file mode 100644 index 0000000000000000000000000000000000000000..21ee7e84548a40169f5b808b61de992fb722ddf2 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/2d3c5476-7a9d-495b-a3af-6a33ec44e36c.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6009cdc0b962719d3e7268723abfc2b3281597bfa831c22135cba9a253ba3de1 +size 147818 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/2e32fc3a-59ac-43ed-a496-4fac9e8d3527.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/2e32fc3a-59ac-43ed-a496-4fac9e8d3527.lance new file mode 100644 index 0000000000000000000000000000000000000000..7d7ea8fe8bfef8a76f07cdd3e56c16b8facdcc25 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/2e32fc3a-59ac-43ed-a496-4fac9e8d3527.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50bca5e97334e440236ca65bb06f933f26d2b7f5a7a8baffd070de65dea0fe9a +size 146949 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/2fe22c3c-39cc-4cb8-a4cd-5084ee9a1f28.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/2fe22c3c-39cc-4cb8-a4cd-5084ee9a1f28.lance new file mode 100644 index 0000000000000000000000000000000000000000..8baa3ecffb26d29ec774fed614028f29591d04d6 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/2fe22c3c-39cc-4cb8-a4cd-5084ee9a1f28.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39a01f00986d82d40c944de2b7d0e039e9d8608f3dffb60cbf0adc05a6ab6a72 +size 147541 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/30998d1f-428e-46b4-a33d-0a749bf9205f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/30998d1f-428e-46b4-a33d-0a749bf9205f.lance new file mode 100644 index 0000000000000000000000000000000000000000..c241af4a269f3fa611469f04cbde2e96c624e83f --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/30998d1f-428e-46b4-a33d-0a749bf9205f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f95d0a098d32b4e2fd7b57079d4105c322af6720f55396662da4dbab1a1ad3 +size 148186 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/3187d129-8d7b-41d6-965b-af5acacc2279.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/3187d129-8d7b-41d6-965b-af5acacc2279.lance new file mode 100644 index 0000000000000000000000000000000000000000..23775807cf8057afd642bbe17a8d53c47174c57c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/3187d129-8d7b-41d6-965b-af5acacc2279.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fccb63825a7defaab7d6ac2defbb4d6a1bfa885714e944d236bc5bf584539f2 +size 147477 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/35ad1603-41d6-49b9-b689-d274ead49a39.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/35ad1603-41d6-49b9-b689-d274ead49a39.lance new file mode 100644 index 0000000000000000000000000000000000000000..442c900691288a48e05d05cec50b6668d2fb8f06 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/35ad1603-41d6-49b9-b689-d274ead49a39.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:918876dfea5970abee7ce9068a1219961f5745766f86730f187ec092e9d3667b +size 146435 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/3837d093-85b6-403b-a041-e9654308395a.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/3837d093-85b6-403b-a041-e9654308395a.lance new file mode 100644 index 0000000000000000000000000000000000000000..be96538de0e903e18bb8510a2b41478e1f8ba500 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/3837d093-85b6-403b-a041-e9654308395a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be2336ef40aab76d9c8a889e91424f14d764071ca846df2c8beaa89f8bc0c2fd +size 147796 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/39806e15-8419-4e9e-82d2-baeb50355f28.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/39806e15-8419-4e9e-82d2-baeb50355f28.lance new file mode 100644 index 0000000000000000000000000000000000000000..5348a7edb0134de448f5354bb1874b391121c5e7 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/39806e15-8419-4e9e-82d2-baeb50355f28.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ea78be77083f8bc0c4a7519e6bb4aba5c173fc727c7835cd7ef24df885f9fa +size 147774 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/3d6137c8-c138-4032-a421-bccdc96c1dee.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/3d6137c8-c138-4032-a421-bccdc96c1dee.lance new file mode 100644 index 0000000000000000000000000000000000000000..62fb2880d282a05ed793e3581e7125bb159ec0f1 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/3d6137c8-c138-4032-a421-bccdc96c1dee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50a429eb4321fc1a5a2cabd948a4e2a4fca4de101c65bdffb328e9133e9ad572 +size 147436 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/405382cd-aa27-4ac2-b2cb-4dba65863bba.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/405382cd-aa27-4ac2-b2cb-4dba65863bba.lance new file mode 100644 index 0000000000000000000000000000000000000000..68b2dc29c0a3468b2bae854b37f46555e9a507a4 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/405382cd-aa27-4ac2-b2cb-4dba65863bba.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f12b0e89bff138135869e2527722016fbe4b4e23d59eb83a9417f1f3772d32 +size 148252 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/461297c8-97e4-4bb5-80be-077148aef9ac.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/461297c8-97e4-4bb5-80be-077148aef9ac.lance new file mode 100644 index 0000000000000000000000000000000000000000..12a95ce84c3c882d8b8903c7663d463051956ffb --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/461297c8-97e4-4bb5-80be-077148aef9ac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:985d426c5d3e4ac9ac096827448cfa82fd53a1d756f729a4259e5531115f54da +size 147803 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/46beaf32-c66c-4f1f-ba0e-c07f48adcf07.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/46beaf32-c66c-4f1f-ba0e-c07f48adcf07.lance new file mode 100644 index 0000000000000000000000000000000000000000..22d38adba230bed1863d853e829828326c25c003 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/46beaf32-c66c-4f1f-ba0e-c07f48adcf07.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccb6b15d34805f7fee967b197e6a0327d805232e3948bb60d789a0a6a1b4ad4 +size 147891 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/4b581433-9143-4c80-8d8c-509836005379.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/4b581433-9143-4c80-8d8c-509836005379.lance new file mode 100644 index 0000000000000000000000000000000000000000..634e0929554df4f7be8315aa172a9e4631a5d129 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/4b581433-9143-4c80-8d8c-509836005379.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd29d32af1c422ffa44642bcc17fe5e5afc168cd99f31117aa952e6fa1d0ac73 +size 147138 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/4c21e0a9-ea2e-4fc2-8c23-5c744a828733.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/4c21e0a9-ea2e-4fc2-8c23-5c744a828733.lance new file mode 100644 
index 0000000000000000000000000000000000000000..9dfaabef0021d11675eb4dbca29ddc00f9f2de73 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/4c21e0a9-ea2e-4fc2-8c23-5c744a828733.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c0499d90baf7f409eb9d37db995aa34b27fcece5a11de5f7aa6e2d23f00a7ff +size 147793 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/4d44d744-7709-4a11-a600-d0c37d5a7c05.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/4d44d744-7709-4a11-a600-d0c37d5a7c05.lance new file mode 100644 index 0000000000000000000000000000000000000000..ee0180e9686dfe26f8b15108dac9fd6b2f7046d9 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/4d44d744-7709-4a11-a600-d0c37d5a7c05.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86b4b537233ef54bfc052d286262358bf391ba00a53834dcc6a88aef3a425285 +size 148084 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/4df5666f-feff-4cb1-ab8b-8fba528992c9.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/4df5666f-feff-4cb1-ab8b-8fba528992c9.lance new file mode 100644 index 0000000000000000000000000000000000000000..0c3f8c2eb1e87db25582f833905a6cd91f2e75cd --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/4df5666f-feff-4cb1-ab8b-8fba528992c9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6304c4760b680f9a55d2c4206acd39276c6bc8022238464dce9c3e68ba7105f +size 147763 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/4f113fe6-bf89-4cfd-ab51-9d2623b191fd.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/4f113fe6-bf89-4cfd-ab51-9d2623b191fd.lance new file mode 100644 index 0000000000000000000000000000000000000000..6521129a527482e352b7cee4118a8a83a1fa79b5 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/4f113fe6-bf89-4cfd-ab51-9d2623b191fd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77917d11b495ff8ea4a164104bc11fab5a63c62b70dba39672415c9b0b7703be +size 147700 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/507d5f2f-9d8b-48d6-b8c4-ecc6f3d4db15.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/507d5f2f-9d8b-48d6-b8c4-ecc6f3d4db15.lance new file mode 100644 index 0000000000000000000000000000000000000000..782d290d426bdc417030f237e7d1cf2b46ce0992 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/507d5f2f-9d8b-48d6-b8c4-ecc6f3d4db15.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9406196d4fa6a9bb776615bdb2bd2b0b89fb5b92cfaa534085a63c44055f46c +size 147878 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/51127964-4e2d-496a-b8bd-a5141cfd1798.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/51127964-4e2d-496a-b8bd-a5141cfd1798.lance new file mode 100644 index 0000000000000000000000000000000000000000..38c9ff257673c587c60b500eec0ba4b78e902271 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/51127964-4e2d-496a-b8bd-a5141cfd1798.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3d5a4dfe64e59bb881a11ce66b3069a264a064e1497eae4e2795f7b11b228a0 +size 147914 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/53c44da8-89ed-4a7e-accb-7c297aa9edff.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/53c44da8-89ed-4a7e-accb-7c297aa9edff.lance new file mode 100644 index 0000000000000000000000000000000000000000..f3cc41ebeee533432ea815a0b3cb500f728b5ebf --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/53c44da8-89ed-4a7e-accb-7c297aa9edff.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cf1ef2ef4012955974c29afbb369605e32101f92597f137c3f2c561976f981 +size 147351 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/54a6cebb-5bb7-4fbe-91f5-2767621dc590.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/54a6cebb-5bb7-4fbe-91f5-2767621dc590.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc30173678ed940ae6d1ca22ffdb4c0206b48489 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/54a6cebb-5bb7-4fbe-91f5-2767621dc590.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd1ac69c79e86bfaa4595608e57795ad53feb52f20e8409e59247bd73aa483a +size 146860 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/56fd21d2-b7cc-4b9c-948c-870e3af85c6b.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/56fd21d2-b7cc-4b9c-948c-870e3af85c6b.lance new file mode 100644 index 0000000000000000000000000000000000000000..cb75c600caf0eabcdfc772d1787a300ee1d8cee4 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/56fd21d2-b7cc-4b9c-948c-870e3af85c6b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47543b02abb00a08e5f789dbc1b28264476c9af25e589283377d4f9025793efa +size 147369 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/58a39e45-2ba2-42ac-87df-363a1b0fd204.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/58a39e45-2ba2-42ac-87df-363a1b0fd204.lance new file mode 100644 index 0000000000000000000000000000000000000000..d1baa4e0c932f7960809dbd3f94e3f6fefd553cf --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/58a39e45-2ba2-42ac-87df-363a1b0fd204.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ba33eac9b372411255778a089fffa3e6e4a857b5eec6ad599aa75fad71b0cda +size 147732 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/58d0bc8e-dbe7-4791-8ebd-421cd8f9bf62.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/58d0bc8e-dbe7-4791-8ebd-421cd8f9bf62.lance new file mode 100644 index 0000000000000000000000000000000000000000..d299ea3fe9f575de352248be0791e15abac32694 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/58d0bc8e-dbe7-4791-8ebd-421cd8f9bf62.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffffe54129f140e69855e18bba7a3c45f4365d081ecb6cf1fcc41c96992f29ac +size 146991 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/59f912e9-d2e2-4d12-985b-5f21d8ef3608.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/59f912e9-d2e2-4d12-985b-5f21d8ef3608.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a374d922002ae2f0ce2fa96e27bd604bd2867fa --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/59f912e9-d2e2-4d12-985b-5f21d8ef3608.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e6fe4464fb71e87a200fe816ea3faf65d16ebe8eeb3fdd6a8927ef916485ba +size 147421 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/5a4c4409-6158-4264-860c-06adb56197a0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/5a4c4409-6158-4264-860c-06adb56197a0.lance new file mode 100644 index 0000000000000000000000000000000000000000..51d5d1276ab36b4979cc018b07e3d65a30496ff8 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/5a4c4409-6158-4264-860c-06adb56197a0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:736233b55053e4bd0205868a278bc7c3d5fc7a8c5219233bdf53593a21b002a1 +size 146613 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/5aaf6931-c4aa-4b5a-bc87-0b78747889fa.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/5aaf6931-c4aa-4b5a-bc87-0b78747889fa.lance new file mode 100644 index 0000000000000000000000000000000000000000..1194043dcc1ce631e28b5ab3b645f0b06aae69f9 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/5aaf6931-c4aa-4b5a-bc87-0b78747889fa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7473a9d7a5ba0e77039a0b1d786b03beb652fb9b68811dd801128d1dfda71f70 +size 147980 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/5beaed8c-2a34-4de6-a0dd-1ee380aef978.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/5beaed8c-2a34-4de6-a0dd-1ee380aef978.lance new file mode 100644 index 0000000000000000000000000000000000000000..6045b6ffe0fea05d99f3af879a19752ae7876651 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/5beaed8c-2a34-4de6-a0dd-1ee380aef978.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf775d3a758dad559a297d9e66bcad88acc67b810097025d5bfafdd089dbba9 +size 148244 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/5dfd2bed-e2db-4b19-9dbc-c92e476a6cb1.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/5dfd2bed-e2db-4b19-9dbc-c92e476a6cb1.lance new file mode 100644 index 0000000000000000000000000000000000000000..8b20e8dba3a7063ddfc7dba41815d467cf976954 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/5dfd2bed-e2db-4b19-9dbc-c92e476a6cb1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:172103f0bec0a810addc61ee44831db523d1a7f2283529c29a27c188a18afa34 +size 147146 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/5f488e15-ba6f-43a0-9eb4-7d8973a8a887.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/5f488e15-ba6f-43a0-9eb4-7d8973a8a887.lance new file mode 100644 index 0000000000000000000000000000000000000000..e83007af0293e18c5d55e2fd68b8c3e67cb78ce5 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/5f488e15-ba6f-43a0-9eb4-7d8973a8a887.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0afe5040c2e47906ac9fdc70b01d264bc5fefd31e2ed690012e936659dccb378 +size 148184 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/626e0f53-7eb1-4948-8f82-2318e65f57ea.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/626e0f53-7eb1-4948-8f82-2318e65f57ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..0cccf45b2a7de128f6dbc6f0f6e99f22b7b9381d --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/626e0f53-7eb1-4948-8f82-2318e65f57ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9dca4c84f2ed438baf8677582a35686d575e4a18742041f6a0ffb303f4a56c9 +size 147090 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/628a34c8-891b-48d2-9f5e-5497ff180497.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/628a34c8-891b-48d2-9f5e-5497ff180497.lance new file mode 100644 index 0000000000000000000000000000000000000000..46bfc98591bec9a703cb013f6284d19860d537a2 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/628a34c8-891b-48d2-9f5e-5497ff180497.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a11818bd240311ae96c10a1a9a99086ce392a4ff5cb6b820f8fcda5fc3e7102f +size 147516 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/62e003a5-44fc-4f5d-a338-252ad45e5236.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/62e003a5-44fc-4f5d-a338-252ad45e5236.lance new file mode 100644 
index 0000000000000000000000000000000000000000..6dccd375b3191568bf03fee426fc84354b19cc06 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/62e003a5-44fc-4f5d-a338-252ad45e5236.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4c7972550b323b547eb177df96d81e2ab9b74c432dbbcf262c8a06ff4ca43c5 +size 148309 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/63bde1e6-bb96-4cd9-b244-4f1382f60694.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/63bde1e6-bb96-4cd9-b244-4f1382f60694.lance new file mode 100644 index 0000000000000000000000000000000000000000..0bdb168bd1455e10e794361eee029667aa3b20e6 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/63bde1e6-bb96-4cd9-b244-4f1382f60694.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4bed9dbb7f596029119fc397c674241d007d749f474c8a8f4ca37af7ee9678f +size 147864 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/646dbf88-9e44-4e7a-9594-c7fcdbb2cce4.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/646dbf88-9e44-4e7a-9594-c7fcdbb2cce4.lance new file mode 100644 index 0000000000000000000000000000000000000000..7a85c6e9700f76f8d62ce05bae488da2c3aec706 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/646dbf88-9e44-4e7a-9594-c7fcdbb2cce4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40aa6caea1136b2ddc37c4bd03250169511edbc80cfce84caa8f2bdfbf9cb156 +size 146719 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/65f88b9a-5047-4ca9-8413-addb49c40c59.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/65f88b9a-5047-4ca9-8413-addb49c40c59.lance new file mode 100644 index 0000000000000000000000000000000000000000..1f7e65a85d43aeaa5f0bcc72fb9058a6031ec11c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/65f88b9a-5047-4ca9-8413-addb49c40c59.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5a992a748b119f52c3c4a545697dba06cb25401ab67b11080bb23d9b7a4b0fd +size 147731 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/67ad1146-1e34-4c95-b631-fc9f1f8b896d.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/67ad1146-1e34-4c95-b631-fc9f1f8b896d.lance new file mode 100644 index 0000000000000000000000000000000000000000..535729710d7f41417d037bb9fe0300188e4637ad --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/67ad1146-1e34-4c95-b631-fc9f1f8b896d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36455774b130807adfe7d8e5a8d571b566705665aee2ceac12e64a9d4710cb21 +size 147732 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/686feda0-2732-49c8-b1d7-7ddfe01c00f3.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/686feda0-2732-49c8-b1d7-7ddfe01c00f3.lance new file mode 100644 index 0000000000000000000000000000000000000000..62c84d5606566ef05794bb04dfea24563577398c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/686feda0-2732-49c8-b1d7-7ddfe01c00f3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:960a19458813b3c76686bd9655fb16497b4e4c0b8a62528b8a5c401da2411f19 +size 147342 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/698ab9ae-5e68-46d0-b900-61efb5d7b7c7.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/698ab9ae-5e68-46d0-b900-61efb5d7b7c7.lance new file mode 100644 index 0000000000000000000000000000000000000000..22523853ebba3f2002ed53975856bddb97615708 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/698ab9ae-5e68-46d0-b900-61efb5d7b7c7.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af2f0276aff6735d1fcde51b5006074ed3b5d3cec737f121a05dbf72effedb6f +size 147312 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/6b391701-7ba1-44ae-81f9-38df07347314.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/6b391701-7ba1-44ae-81f9-38df07347314.lance new file mode 100644 index 0000000000000000000000000000000000000000..6f64a56039e1f70c05c2b4f9dcb02644a973e192 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/6b391701-7ba1-44ae-81f9-38df07347314.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4623820e586d2e4a1629bf18d253f7743eaaab97fa7f2c6700cd56d4363dfcb8 +size 148032 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/6cc14fe9-8728-4d08-888c-f7aeb2a96537.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/6cc14fe9-8728-4d08-888c-f7aeb2a96537.lance new file mode 100644 index 0000000000000000000000000000000000000000..566c2e5d0f6d21483b4831ac970ac4be9b870e4e --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/6cc14fe9-8728-4d08-888c-f7aeb2a96537.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b16aed9bba687b27f39c3a40c8c94df46878a44d11481d16337a7f064d7634 +size 147180 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/6f7d5774-95ff-4210-9c1c-02c70990e99f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/6f7d5774-95ff-4210-9c1c-02c70990e99f.lance new file mode 100644 index 0000000000000000000000000000000000000000..9ea54d82a5dd4403e2315e28c4c93b51269336e1 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/6f7d5774-95ff-4210-9c1c-02c70990e99f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2583018af8d8818dc4c4baa339eaebf9a442b62d13072af6b8a021775118f023 +size 147227 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/73605f55-e797-4e2a-87e4-9560ee781d65.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/73605f55-e797-4e2a-87e4-9560ee781d65.lance new file mode 100644 index 0000000000000000000000000000000000000000..8b13401cb5018b356f4c5aea748f2cb142417e0c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/73605f55-e797-4e2a-87e4-9560ee781d65.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7085abdc99bd9ad51dfacb27c20e575a20d397aced4f0a7e1fd3545d7f84999d +size 147818 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/76fb1a59-060e-414c-8407-9dbf00cba8a6.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/76fb1a59-060e-414c-8407-9dbf00cba8a6.lance new file mode 100644 index 0000000000000000000000000000000000000000..44db8e20ed42e91d2b6eb96311874b31bd3f3402 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/76fb1a59-060e-414c-8407-9dbf00cba8a6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b40aafa440935a478cd8ce2381ac1407a60edc8ab2de28f6305cc31716a87c21 +size 148261 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/778105f0-88a7-4515-b4a6-08a2c4a9fac5.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/778105f0-88a7-4515-b4a6-08a2c4a9fac5.lance new file mode 100644 index 0000000000000000000000000000000000000000..fc2617ba1d3332211125ed46ec83f7e44bec50b3 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/778105f0-88a7-4515-b4a6-08a2c4a9fac5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e111805a3b9dba29539c23131411d4ba12c9f0f1dc3ded9a734bbc59c926693 +size 147561 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/77bc3677-8914-4daf-bcab-05bbcaa53b69.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/77bc3677-8914-4daf-bcab-05bbcaa53b69.lance new file mode 100644 index 0000000000000000000000000000000000000000..82a6d9dc3009148d2553bf4600e51cabf38e4191 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/77bc3677-8914-4daf-bcab-05bbcaa53b69.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46c618c22e2f8a13daf66c4a4a3ba7fa03793aa77052f30fbc811734dc9c95ac +size 147792 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/7ac30f19-61b8-442d-8175-86e2f774b280.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/7ac30f19-61b8-442d-8175-86e2f774b280.lance new file mode 100644 index 0000000000000000000000000000000000000000..a1cf84084b02a331ed36553f13aeb58fb4e87546 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/7ac30f19-61b8-442d-8175-86e2f774b280.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110a581cec011b7f5767d69e3f0b2db3a3ad8b4209aa201761bb7b4f46bb5651 +size 147756 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/7ad1578d-6986-44ce-92e0-efd7b5aa7328.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/7ad1578d-6986-44ce-92e0-efd7b5aa7328.lance new file mode 100644 index 0000000000000000000000000000000000000000..c48786754641cdf6b61073f2269d5f1c49c6dccd --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/7ad1578d-6986-44ce-92e0-efd7b5aa7328.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:717c57da0194c0bf52d21d96573e8a3f149b580726dd41ae3f40776318391dd3 +size 146632 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/7c1a022a-2eed-44ca-bcf2-ac2650fe792e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/7c1a022a-2eed-44ca-bcf2-ac2650fe792e.lance new file mode 100644 index 0000000000000000000000000000000000000000..1778cec7e5b0171a7c52d984ee9607c3014b990a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/7c1a022a-2eed-44ca-bcf2-ac2650fe792e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d57c26055eff95a8484f23efdc8d61f953d9005c044a755a69b4a6d635182f +size 147561 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/7e0f64d2-5e10-4edb-8ef1-abea65dbbad0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/7e0f64d2-5e10-4edb-8ef1-abea65dbbad0.lance new file mode 100644 index 0000000000000000000000000000000000000000..2a6faf0b6a9d80ff610ceac1cda334ccf66d1458 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/7e0f64d2-5e10-4edb-8ef1-abea65dbbad0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65d9a8ac8d6ccea5e21f650d5353f4151df882c466a57e6b45722626ff700be5 +size 147410 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/7e15eebd-67d5-4aa8-b518-cda9d361c6a0.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/7e15eebd-67d5-4aa8-b518-cda9d361c6a0.lance new file mode 100644 index 0000000000000000000000000000000000000000..4d8cfcdfb7652fd9021da124ede1062b5dd16839 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/7e15eebd-67d5-4aa8-b518-cda9d361c6a0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e206b7271f516bdf3d7c0de8f56a6c21272544af598a6b5d0662710167cd0e0 +size 146890 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/8104280f-0e22-498e-b6e3-818280279d66.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/8104280f-0e22-498e-b6e3-818280279d66.lance new file mode 100644 
index 0000000000000000000000000000000000000000..0dba43ce730c94c3ce8b44849acca8ffbab1d539 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/8104280f-0e22-498e-b6e3-818280279d66.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27ac3b384a0efa2f1b75a7f815448a46e987337fb30cb2e0b5f1e46a4f6f67ea +size 148376 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/81998293-8935-4025-a0c8-ab5fb93757d5.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/81998293-8935-4025-a0c8-ab5fb93757d5.lance new file mode 100644 index 0000000000000000000000000000000000000000..f484517ae270f5a5187d6ea579e2488ff04dc428 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/81998293-8935-4025-a0c8-ab5fb93757d5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bccf0a9edfbd27805c5c6376b260446d8a88fc50110d9f283031c536e732c29 +size 147045 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/828f018d-0fd0-445e-9bee-9a01f8ac172f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/828f018d-0fd0-445e-9bee-9a01f8ac172f.lance new file mode 100644 index 0000000000000000000000000000000000000000..0bfe97924a358c03cf842d0d76f576ce164332d8 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/828f018d-0fd0-445e-9bee-9a01f8ac172f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10de41fc92c22676c9351792962766470b3591f20cf0f365062d475e1ee63a82 +size 147891 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/84d7c735-fe48-414c-884e-fb54bd9b2505.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/84d7c735-fe48-414c-884e-fb54bd9b2505.lance new file mode 100644 index 0000000000000000000000000000000000000000..443df1c4fc3d2e496e4999311c8917c9ddc2bbbf --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/84d7c735-fe48-414c-884e-fb54bd9b2505.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a57060459b0576befc6b001651d11ca94b6b31162066d29d98b656996f84a8ff +size 148314 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/852a17a4-9c3f-4dcc-b23e-8b47fb2b08fd.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/852a17a4-9c3f-4dcc-b23e-8b47fb2b08fd.lance new file mode 100644 index 0000000000000000000000000000000000000000..a45fdfd878366fac9eb87ee80e0dabf41c0a212f --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/852a17a4-9c3f-4dcc-b23e-8b47fb2b08fd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b23e527509949669208694c382c36a51b6ee70b5bb130953c14285906923a7f +size 146885 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/85d6ab28-ec45-4674-8e9d-d6f356a80583.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/85d6ab28-ec45-4674-8e9d-d6f356a80583.lance new file mode 100644 index 0000000000000000000000000000000000000000..ee1170aedc970869167bec3d7a38ec63c1b4fe1f --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/85d6ab28-ec45-4674-8e9d-d6f356a80583.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c38d29e09387af25dbe2d1fa30ba3f7eaa47a9d1bc0aa01c309777f24c75aba +size 147117 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/87e32afd-b026-44af-b86e-f13134a62e93.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/87e32afd-b026-44af-b86e-f13134a62e93.lance new file mode 100644 index 0000000000000000000000000000000000000000..f187f3625bcab1cae34485736e750b91f0bf9079 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/87e32afd-b026-44af-b86e-f13134a62e93.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63a69809d0a2d4b9565a12e1b17015fcef02124245a4018a0bdc5dae22d2b08b +size 148113 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/87f1f387-cedf-4332-a0d9-e094d7ef1781.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/87f1f387-cedf-4332-a0d9-e094d7ef1781.lance new file mode 100644 index 0000000000000000000000000000000000000000..a4a361fa8a4eca2de762227746f4bfc470a1e042 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/87f1f387-cedf-4332-a0d9-e094d7ef1781.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b0bd60bce94b940ffd8eaafdf042fc387bf3a92a6a6d5987874819a7556ce33 +size 148232 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/88135ae1-41d8-4c00-a1b1-6d7390d44008.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/88135ae1-41d8-4c00-a1b1-6d7390d44008.lance new file mode 100644 index 0000000000000000000000000000000000000000..1ed176b9239d79b6706e9e483a1f1c9f0eeeb3c3 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/88135ae1-41d8-4c00-a1b1-6d7390d44008.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1502934263796304003382ec498458bb98f432605cba8f8a318d8350ed72ff76 +size 148057 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/8f526de9-9419-4d2a-ab8e-305e9869a4c5.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/8f526de9-9419-4d2a-ab8e-305e9869a4c5.lance new file mode 100644 index 0000000000000000000000000000000000000000..23710ea17d03b9fb29ce4f98e2724ee1066f9aa9 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/8f526de9-9419-4d2a-ab8e-305e9869a4c5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8abf8d10661dc3c8006016af137fc41ce91f1ecb0a8da3858d2e1196b4337115 +size 148240 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/90dc1444-3c01-4a0e-b2a8-ecf2602fe94d.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/90dc1444-3c01-4a0e-b2a8-ecf2602fe94d.lance new file mode 100644 index 0000000000000000000000000000000000000000..0bb4669cd398384be4e9b181c53519fb730dc538 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/90dc1444-3c01-4a0e-b2a8-ecf2602fe94d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:082fc8a8e01bd53a02114a7ba0f35f0bdc988c2a88495aef5f71e1de0eace6ae +size 147030 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/90e0b4d2-d35d-4e33-9320-9e4e7e97d13f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/90e0b4d2-d35d-4e33-9320-9e4e7e97d13f.lance new file mode 100644 index 0000000000000000000000000000000000000000..e8c9b9dc414db53791b08632247bed15dcb871d6 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/90e0b4d2-d35d-4e33-9320-9e4e7e97d13f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90c975d3ae0063ca0d63e6fb1ac83cee1d80f75b782b55676a770fc2eee52e6 +size 147398 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/92cf0217-410c-4258-96c0-6f1c8a90c890.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/92cf0217-410c-4258-96c0-6f1c8a90c890.lance new file mode 100644 index 0000000000000000000000000000000000000000..67c69b1ac786f48a6846006f46dd14228a932a9a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/92cf0217-410c-4258-96c0-6f1c8a90c890.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:707a0e12736269ab5b8290001b6b9c271b9f87270dd9db060daf56f0a386e0d3 +size 147253 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/98551adc-3bfe-447a-af3e-ba73a99595b9.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/98551adc-3bfe-447a-af3e-ba73a99595b9.lance new file mode 100644 index 0000000000000000000000000000000000000000..ebc66eab4d8ade8cb27e372cd0b2ef8edc00e2b0 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/98551adc-3bfe-447a-af3e-ba73a99595b9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ce74e4d1cffb7f8191612b9083797d8563211c15c984ccf09564beeaab780f +size 129446 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/987859a5-9016-449b-a530-0a9fe428c7f4.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/987859a5-9016-449b-a530-0a9fe428c7f4.lance new file mode 100644 index 0000000000000000000000000000000000000000..303aa97dec7725130335f4015fa921fdc03a311a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/987859a5-9016-449b-a530-0a9fe428c7f4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65dd1997d73497fd7ade0d33f18e7d516e05335fe20decfe6f180c80c49e1056 +size 147318 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/9880a562-5317-42e1-8bfc-de2101a54a2d.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/9880a562-5317-42e1-8bfc-de2101a54a2d.lance new file mode 100644 index 0000000000000000000000000000000000000000..70a808d57a3637522fc43105b310d45bb8fd266b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/9880a562-5317-42e1-8bfc-de2101a54a2d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdae506113b79560f72df81d84fcd4bbdbc719f4b88d87554279d6eefeef2acf +size 147828 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/9da4345e-41db-43c8-a4d7-76905ffe6957.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/9da4345e-41db-43c8-a4d7-76905ffe6957.lance new file mode 100644 index 0000000000000000000000000000000000000000..efbb1c4c19da3159c0bfcc6e3626f99bb22d7c78 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/9da4345e-41db-43c8-a4d7-76905ffe6957.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa23fc7bdc105a6c3476d34cb7225c1e46525d159daf955d2b4c94a49bae560 +size 147606 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/9f6689d9-73d6-4ff0-82b9-4e6462e0ed44.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/9f6689d9-73d6-4ff0-82b9-4e6462e0ed44.lance new file mode 100644 index 0000000000000000000000000000000000000000..14eb1f1eacdb9d43f4a34981b9d7b06d86053f44 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/9f6689d9-73d6-4ff0-82b9-4e6462e0ed44.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8240ae0bef44be6be68a6197f8daeb363d11321b5be131fc02f433a73497e3 +size 147941 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a1039d12-c12d-48ad-9e63-6b4f07bb7298.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a1039d12-c12d-48ad-9e63-6b4f07bb7298.lance new file mode 100644 index 0000000000000000000000000000000000000000..39b92583c8d9f8b8ddc3495fae1da3923e7098ec --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a1039d12-c12d-48ad-9e63-6b4f07bb7298.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44f04a7bb030432ff5bc9d17cb521aba7f152a36c5a41d2c0b73b3e70a8aef9d +size 145441 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a2a4a942-2cd9-46fb-b7aa-802cd8ae7aa1.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a2a4a942-2cd9-46fb-b7aa-802cd8ae7aa1.lance new file mode 100644 
index 0000000000000000000000000000000000000000..29fbca20a0b29296492e616c72e82280a8573589 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a2a4a942-2cd9-46fb-b7aa-802cd8ae7aa1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80542b300fd9b22a4c9e9cf2156eb0e2d49ad0e0a09b63f9a52df79a809e8d6d +size 148242 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a34d712c-e60d-4f2a-a708-3ad7ecc02f88.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a34d712c-e60d-4f2a-a708-3ad7ecc02f88.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f4e341d30d9becde3193b1d4c23d8947effad3b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a34d712c-e60d-4f2a-a708-3ad7ecc02f88.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb9d7f339bede6319fdaed468064df0c4ef724054c96caf76965a320c7b1387f +size 146447 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a4e9b91e-8668-488b-b19c-7b5faf5a84ba.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a4e9b91e-8668-488b-b19c-7b5faf5a84ba.lance new file mode 100644 index 0000000000000000000000000000000000000000..839e4f28f7586a6af7f3d7d531ffeb05720a3a60 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a4e9b91e-8668-488b-b19c-7b5faf5a84ba.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c010cf9e8d60bf76e83cd66efd95ddd31407b258689026d397513318fe0c53d +size 147951 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a54ac2e7-242d-4c01-8be3-9000cdcc483f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a54ac2e7-242d-4c01-8be3-9000cdcc483f.lance new file mode 100644 index 0000000000000000000000000000000000000000..be49a6ce6950f1eb963537f2db3ce2a56d7a525f --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a54ac2e7-242d-4c01-8be3-9000cdcc483f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e23174cea52669d65fb7b7cc21e6898016c003d59e622eb72e55ebf9dd6531e8 +size 148240 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a7931ec1-e77f-4dbf-b975-a45dfb407451.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a7931ec1-e77f-4dbf-b975-a45dfb407451.lance new file mode 100644 index 0000000000000000000000000000000000000000..040ea07020a35833a2244acf4c59f1c3c610e393 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a7931ec1-e77f-4dbf-b975-a45dfb407451.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb562b9bd4927551770cbd871f537de13feeaca0a1e45f5861627f4cb7f13af1 +size 147802 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/a8993915-5399-4524-af53-ed1905848666.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/a8993915-5399-4524-af53-ed1905848666.lance new file mode 100644 index 0000000000000000000000000000000000000000..a07ecded07a3aefa6fcadd2789aaf32330a205a6 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/a8993915-5399-4524-af53-ed1905848666.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e164e700eb0f4c8e052c7f765a876b69942e186279a39caf83f168bbfc49f8 +size 147949 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/ab8203d5-341a-40ea-a6a7-afa8f9c4a6bd.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/ab8203d5-341a-40ea-a6a7-afa8f9c4a6bd.lance new file mode 100644 index 0000000000000000000000000000000000000000..27b5e0c10090f5cc9f3936d9e0c4e15f7431e6bb --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/ab8203d5-341a-40ea-a6a7-afa8f9c4a6bd.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a1d568c8140737132020e77042dbd686dd316084fa853f121c6f0889a0b67dd +size 147433 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/aef5b77f-823a-450e-be70-3ba37b1a43c8.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/aef5b77f-823a-450e-be70-3ba37b1a43c8.lance new file mode 100644 index 0000000000000000000000000000000000000000..779c3129e78b36f6787523ab1d20f7c5d18f7ec0 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/aef5b77f-823a-450e-be70-3ba37b1a43c8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e1bb17dd6751145bc21b9375091e65bc0daee68568b92d1ca46e6c9d192cf1a +size 148243 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/b1264517-5da1-41e3-8187-e32cd8af0194.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/b1264517-5da1-41e3-8187-e32cd8af0194.lance new file mode 100644 index 0000000000000000000000000000000000000000..3851d76e97344c4663c259ce8dc01589c8a47f76 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/b1264517-5da1-41e3-8187-e32cd8af0194.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826dadadd2a1cd1e26d0b86097157824d4f8031376cd461ffa0877c5faa8c624 +size 147829 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/b5ab4f01-61c8-4ab1-924b-ac12835a1b18.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/b5ab4f01-61c8-4ab1-924b-ac12835a1b18.lance new file mode 100644 index 0000000000000000000000000000000000000000..867ebb2f72584be6a593136fe6fa5edb2515721d --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/b5ab4f01-61c8-4ab1-924b-ac12835a1b18.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c43ce7c9d902136976bca69cedb15a5887b95b79465c73feb3225bcf119f4327 +size 148067 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/b5e38e28-ffd6-4cc0-a9c0-7918fae96c22.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/b5e38e28-ffd6-4cc0-a9c0-7918fae96c22.lance new file mode 100644 index 0000000000000000000000000000000000000000..a54fcfe17244d5850e0acf1352e1d7bc2b2ee564 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/b5e38e28-ffd6-4cc0-a9c0-7918fae96c22.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28dff9e0cb461a29f25873681cf431d77cfa28842565851f32152fc92c8e46d3 +size 147570 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/b738c353-ca7f-48b0-874b-53f7fb8618da.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/b738c353-ca7f-48b0-874b-53f7fb8618da.lance new file mode 100644 index 0000000000000000000000000000000000000000..df38665900c8be891e924e9a082c6db4d6c66837 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/b738c353-ca7f-48b0-874b-53f7fb8618da.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49fcf9fd988fb88528d92a32a347144f3cec135355bf037e619a5d070d35dd37 +size 148240 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/b8100ea5-50eb-4356-b8ee-ebf59336a509.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/b8100ea5-50eb-4356-b8ee-ebf59336a509.lance new file mode 100644 index 0000000000000000000000000000000000000000..b74d959657876f7ec4e509cbb1fcca69eee31058 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/b8100ea5-50eb-4356-b8ee-ebf59336a509.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de47fc1d5df54a9866c1bf177efe9942bbd0ae1aa8924fedce5f2367876cf520 +size 147870 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/ba477b4f-530b-4493-80eb-57a3c1d1872f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/ba477b4f-530b-4493-80eb-57a3c1d1872f.lance new file mode 100644 index 0000000000000000000000000000000000000000..8d1e7bdccb2759338ec4436df9987d9a91d7fcb3 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/ba477b4f-530b-4493-80eb-57a3c1d1872f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58c7eb400a058799206c0eb1fb91a9d3f0df1f7c3bc69733f9cfac5e1028138b +size 147733 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/bb17640f-1bd3-4ba5-bc14-7d701494dcb6.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/bb17640f-1bd3-4ba5-bc14-7d701494dcb6.lance new file mode 100644 index 0000000000000000000000000000000000000000..66018deb2e66c42cf40e38c381088070aec12945 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/bb17640f-1bd3-4ba5-bc14-7d701494dcb6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b6c5bff7baca4a286ac9ff2908fce402723028e3388cab394ccb3bf8a4b3497 +size 147588 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/bb1f9b3c-52ff-4f69-a1c5-97f74d08d1d3.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/bb1f9b3c-52ff-4f69-a1c5-97f74d08d1d3.lance new file mode 100644 index 0000000000000000000000000000000000000000..d903036284e9ea6d9a9ba601b30c31c54dea55f7 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/bb1f9b3c-52ff-4f69-a1c5-97f74d08d1d3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8aadbe0e38fdb9a6b79c71bf66b1867fc252dfdbb368ef82285509b089f4f7ce +size 147818 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/bc97dd8e-6b50-45eb-aeb0-450c5223e4e5.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/bc97dd8e-6b50-45eb-aeb0-450c5223e4e5.lance new file mode 100644 index 0000000000000000000000000000000000000000..2b4a2a7a0499c86d4f739fbf67a53e8e7b7836df --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/bc97dd8e-6b50-45eb-aeb0-450c5223e4e5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffcf7774fd8e6eb409db76753bc9ed748925ebd74dc406025358a2725f2d5ffd +size 147832 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/bd0544fd-0422-48a6-b0f8-9f5d66680f12.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/bd0544fd-0422-48a6-b0f8-9f5d66680f12.lance new file mode 100644 index 0000000000000000000000000000000000000000..42e826e48d314595407bb2408b9e3a783f5e726d --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/bd0544fd-0422-48a6-b0f8-9f5d66680f12.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c633f37724adf14b91d2fc74634fb22aa87d36482e2a189dbab8044fb82894fa +size 148518 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/bdbf583c-3290-4115-a53f-ddc91cf88228.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/bdbf583c-3290-4115-a53f-ddc91cf88228.lance new file mode 100644 index 0000000000000000000000000000000000000000..e7aa728e65463933a3f7431167d59323c8f51f2b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/bdbf583c-3290-4115-a53f-ddc91cf88228.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78f07004bb29d7a58d498b5335a3641e470732c26f0a36958afeec357ce133af +size 148009 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c0d66caf-50e6-4322-8169-21ce79245ff3.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c0d66caf-50e6-4322-8169-21ce79245ff3.lance new file mode 100644 
index 0000000000000000000000000000000000000000..c6223b59d5511bdae6b6e039e3282d738ad28a94 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c0d66caf-50e6-4322-8169-21ce79245ff3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:304637cbb0866dd98f300ff6b451c9d307f98b467e21ba0520f837c103919bde +size 148245 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c273aa70-bc72-46f2-bc6f-30b12741e76d.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c273aa70-bc72-46f2-bc6f-30b12741e76d.lance new file mode 100644 index 0000000000000000000000000000000000000000..cd97af9e6cd3f8229f4d873448d29a9a40d6decb --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c273aa70-bc72-46f2-bc6f-30b12741e76d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3626e9e01896dd1dee50b52997b6159f4d114e986cb156f414e1b8cf00d6b9d7 +size 147574 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c4805c0a-0b00-47af-8c42-2718d2bab97e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c4805c0a-0b00-47af-8c42-2718d2bab97e.lance new file mode 100644 index 0000000000000000000000000000000000000000..3467a69b58f42b8f034a6b04093c3ef40c3aacfb --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c4805c0a-0b00-47af-8c42-2718d2bab97e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c02dfd205611230534768200a4574f1c868209f5e5ca206a8d4ebac58cf869fb +size 148240 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c4b7feab-e60d-4a90-aace-3b287100c8eb.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c4b7feab-e60d-4a90-aace-3b287100c8eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..98756d38f8f5c1790ee22153eb22cf2de995e0e7 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c4b7feab-e60d-4a90-aace-3b287100c8eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76120031edf4670cfaac8d25e6d4b3e6a1d2f3ff7dbf57ea6687116898dfc3ca +size 147919 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c87d6aaf-9cfe-419a-bfec-372b4bc3a0fd.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c87d6aaf-9cfe-419a-bfec-372b4bc3a0fd.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a476a15805d2f36e488eea6e5314db08144bdbd --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c87d6aaf-9cfe-419a-bfec-372b4bc3a0fd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924e1bd4cc79373d5067a72d401b1a0c2a08ed0fbfbe57a9a283ae80f166f2bf +size 147887 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/c9c2e296-0b9d-45cd-a541-7073486dc419.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/c9c2e296-0b9d-45cd-a541-7073486dc419.lance new file mode 100644 index 0000000000000000000000000000000000000000..4b273578c7d103adf5722189063a5ce48770fdcf --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/c9c2e296-0b9d-45cd-a541-7073486dc419.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:003435e602b8ec3e4659aa8a6901c219218a1c0acd5695396e5f952ae0ed2b99 +size 147639 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/cfe1803b-a01d-4575-9095-e0ebac97a7a8.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/cfe1803b-a01d-4575-9095-e0ebac97a7a8.lance new file mode 100644 index 0000000000000000000000000000000000000000..801803ad1381ae11b97e65c4157bc1b28de59619 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/cfe1803b-a01d-4575-9095-e0ebac97a7a8.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72eeb08f550ba39b6f05adc92726cac0dfa2d4dd31ed4db42d5c52c7b24d1030 +size 147001 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/d2113f3c-1d2f-49eb-937c-c3cd50691d6e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/d2113f3c-1d2f-49eb-937c-c3cd50691d6e.lance new file mode 100644 index 0000000000000000000000000000000000000000..fb27865491d9c47ea5f063c226052c3e1092ee87 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/d2113f3c-1d2f-49eb-937c-c3cd50691d6e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18711a73c9f26a61fb970dfb7e16151642b08c6c4bfe3087259e3012e6672543 +size 147957 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/d4dbe80a-5c33-4e39-94d8-8dd2be322844.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/d4dbe80a-5c33-4e39-94d8-8dd2be322844.lance new file mode 100644 index 0000000000000000000000000000000000000000..6c97e2a277653bc4a6be0c24d8ff2535b3689fab --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/d4dbe80a-5c33-4e39-94d8-8dd2be322844.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3598e0afe2816a2411839b8f13da0ad363637c83df4bdf78991f50e0cc627c56 +size 147262 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/d888bacb-1d01-4027-ad1c-34503ab6e22c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/d888bacb-1d01-4027-ad1c-34503ab6e22c.lance new file mode 100644 index 0000000000000000000000000000000000000000..b143bda0738b455965002900202c313b1141052a --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/d888bacb-1d01-4027-ad1c-34503ab6e22c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e476206d3f3498dbd3aa7ce32e9df1a2b29d29d11a2c50e04773834d8af62b6 +size 147421 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/d9d10362-5af8-444f-81be-14c03702b21c.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/d9d10362-5af8-444f-81be-14c03702b21c.lance new file mode 100644 index 0000000000000000000000000000000000000000..92217b23751e8aae5684416745e4f0aec92e91b3 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/d9d10362-5af8-444f-81be-14c03702b21c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07a8dc1c0a4453451ac8f3b619d1c8ab9990e912e29a90386f8e6f13a49198e +size 147974 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/daa4f090-4020-4c49-a7fa-45992994cdea.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/daa4f090-4020-4c49-a7fa-45992994cdea.lance new file mode 100644 index 0000000000000000000000000000000000000000..74480f8c0fb7354e5b9f8094602bf77136c22f41 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/daa4f090-4020-4c49-a7fa-45992994cdea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783420ca5a9c829aea26efe5d4812941979b7d7b05ae1e544977a6cab9ab82e5 +size 148252 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/dd728f57-509e-4399-b1ae-add48199ca4f.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/dd728f57-509e-4399-b1ae-add48199ca4f.lance new file mode 100644 index 0000000000000000000000000000000000000000..e004e41d042cff1755060f1a52cee00f95f2a708 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/dd728f57-509e-4399-b1ae-add48199ca4f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6592966ef3ce6c24d9d394cd94db57d11a8446f6c73ed4e83bc838e1c40930de +size 147879 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/deb87f87-19fb-4a92-9585-19b33fcea7d3.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/deb87f87-19fb-4a92-9585-19b33fcea7d3.lance new file mode 100644 index 0000000000000000000000000000000000000000..5cb5c1f7b7d250239e55381e863b4f477ab3cb8c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/deb87f87-19fb-4a92-9585-19b33fcea7d3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ff1e59f716694d0616c027264531a9e5ae4509b31b0d811fa5731c268311efb +size 147580 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/e0ac9519-59a4-40cb-9cd0-82518cb646ee.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/e0ac9519-59a4-40cb-9cd0-82518cb646ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..e241445f12fa566ccd9bf9cd3aa8435b0e6b1006 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/e0ac9519-59a4-40cb-9cd0-82518cb646ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34428bccbf474ed115fa5262432924643c3eed1bdd7080c71c8f47937e259676 +size 147857 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/e14e3cc4-64f4-49d4-b138-8690618720fc.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/e14e3cc4-64f4-49d4-b138-8690618720fc.lance new file mode 100644 index 0000000000000000000000000000000000000000..1b33f2f7b6c87b67217ff2b1ce49eb880a7b36ae --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/e14e3cc4-64f4-49d4-b138-8690618720fc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b66b89d92309f153fc07e9a8bdf149db09da532f289809e611b81cdeb3fc8b9b +size 147926 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/e3078991-8dac-4e2b-99f9-28e46ed45340.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/e3078991-8dac-4e2b-99f9-28e46ed45340.lance new file mode 100644 index 0000000000000000000000000000000000000000..a02a66823c9b7a0535c76bd7770b0f03be3c42aa --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/e3078991-8dac-4e2b-99f9-28e46ed45340.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:795546f5753846877fd36fdea9c9af8d57687b34a890d2867df8dafdde0142a6 +size 146816 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/e815d2ae-5993-4be5-ae77-833cc014b8f5.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/e815d2ae-5993-4be5-ae77-833cc014b8f5.lance new file mode 100644 index 0000000000000000000000000000000000000000..75c5b8ac5b20fd68f303212e1dd6bb3fab295b02 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/e815d2ae-5993-4be5-ae77-833cc014b8f5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:803d2e51e3ac36be6bcea8186e88ee2aec63de3e2ce3ab114864a85b8b515d3d +size 148067 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/e9ad8872-9a58-4b8b-b755-9b4bbba006cc.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/e9ad8872-9a58-4b8b-b755-9b4bbba006cc.lance new file mode 100644 index 0000000000000000000000000000000000000000..a660b83ebdf4da0b1bf124db1e98350d3e07c696 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/e9ad8872-9a58-4b8b-b755-9b4bbba006cc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:826109681bccff752b4a063ae153026b651751ea3bca1c1327bf1e23361e09ea +size 147341 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/eea52a18-d00c-429b-b328-c229883860d7.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/eea52a18-d00c-429b-b328-c229883860d7.lance new file mode 100644 
index 0000000000000000000000000000000000000000..60420e8b090c221646bc0f02657a993936671527 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/eea52a18-d00c-429b-b328-c229883860d7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cadd9b69b19946d067991eab83d608eb49e5aa9140490c54f65dc485cc015b8a +size 146625 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/efab9736-2dcf-4913-b5a0-184de25d840e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/efab9736-2dcf-4913-b5a0-184de25d840e.lance new file mode 100644 index 0000000000000000000000000000000000000000..f725b2f1dcd5b57d9634b815c9085d2c537518d6 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/efab9736-2dcf-4913-b5a0-184de25d840e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cfde247b7992edb85f7d8f066cebf215e75b71c23a36e8374aed7161048b5d9 +size 146995 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f3d82b8e-7c8c-4de1-b1fb-93d99a85d309.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f3d82b8e-7c8c-4de1-b1fb-93d99a85d309.lance new file mode 100644 index 0000000000000000000000000000000000000000..cafa6ae60b811da1d4ccb9e81dee3fa39066cdd9 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f3d82b8e-7c8c-4de1-b1fb-93d99a85d309.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:598e5b13586695d98d481eaf67d1bfe27307fe5c841de5179f72ece73146e1af +size 147723 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f5d3cb28-c7bb-4566-8faa-1b8dc708626a.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f5d3cb28-c7bb-4566-8faa-1b8dc708626a.lance new file mode 100644 index 0000000000000000000000000000000000000000..d3d6522a2a169e0482c6a936c4d2e4329b30b11c --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f5d3cb28-c7bb-4566-8faa-1b8dc708626a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:976f5946dc169b191e2d151f84873e35e3d5c63b7205ee19994d63b5b51c7b29 +size 147277 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f6e919b1-2d00-4db6-bf00-aeda590bc9d8.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f6e919b1-2d00-4db6-bf00-aeda590bc9d8.lance new file mode 100644 index 0000000000000000000000000000000000000000..e422573e8828a64fa8d84061ed107451d7d78719 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f6e919b1-2d00-4db6-bf00-aeda590bc9d8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b14202ffb9bca7ebf90ab04ee40097c7586c26b41fdc55d6529ca78cc4deb28c +size 148280 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f7d16e67-3b21-432e-80ca-812a61495a67.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f7d16e67-3b21-432e-80ca-812a61495a67.lance new file mode 100644 index 0000000000000000000000000000000000000000..c9633e3ba703972d845ffe6fb11a0c209477a8fc --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f7d16e67-3b21-432e-80ca-812a61495a67.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbf1ec4cdb0aadda3293715de95a8cab2ee1ee7492b109cd914e0acf693779fd +size 145939 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a9286b-5bb0-4456-a640-58ef4d3281c3.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a9286b-5bb0-4456-a640-58ef4d3281c3.lance new file mode 100644 index 0000000000000000000000000000000000000000..44674ad6b6bf178deb42b6e6d131deb582759693 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a9286b-5bb0-4456-a640-58ef4d3281c3.lance @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c77d22111ec40b5a437a3487b2da60f6935ca99c7b580a22a5e0147955e0c7a +size 148051 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a96255-7cfe-47a2-b0f7-89afd6ca79d1.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a96255-7cfe-47a2-b0f7-89afd6ca79d1.lance new file mode 100644 index 0000000000000000000000000000000000000000..e63cd1913a08e607f0e46c17d4c1a1a461a1750d --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8a96255-7cfe-47a2-b0f7-89afd6ca79d1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e21b043d35be09db5fce7f3b634f3a432c266e6fc20da657862660be340ffb44 +size 148240 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f8dc5dd9-e180-40a3-a0cd-43104cc6e088.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8dc5dd9-e180-40a3-a0cd-43104cc6e088.lance new file mode 100644 index 0000000000000000000000000000000000000000..0942947df8a8071bb2288c4f1cc6fdec5f2fd055 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f8dc5dd9-e180-40a3-a0cd-43104cc6e088.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24abd18a8b84e67e4003d1c312c8a6b2fea535ce44199526b70c295694ce19a7 +size 147606 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f9216fc8-2532-4e13-be21-9411c72e7e85.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f9216fc8-2532-4e13-be21-9411c72e7e85.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a27870cf14e462c76fbd5c16d989976b9cfbe5b --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f9216fc8-2532-4e13-be21-9411c72e7e85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dde173ac0afb4ca2ae4730cd56543dc0104e385e9304907945e16c019aedd633 +size 147230 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/f9d1586f-c84e-42c6-8328-68a2ac2f967a.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/f9d1586f-c84e-42c6-8328-68a2ac2f967a.lance new file mode 100644 index 0000000000000000000000000000000000000000..d6b20a7b54908d23a3115b7eff3f9c9abed3f833 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/f9d1586f-c84e-42c6-8328-68a2ac2f967a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a763ddd66b3f5a78ecfdcf39f7eb68860fad639b731020ff598031575347dae9 +size 146960 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/fa98d50a-ffa4-454f-ba07-bfa20d2b74f6.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/fa98d50a-ffa4-454f-ba07-bfa20d2b74f6.lance new file mode 100644 index 0000000000000000000000000000000000000000..29ca32b92fed37875d189f6acaebbcfc13b15abc --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/fa98d50a-ffa4-454f-ba07-bfa20d2b74f6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c494c3559edb3804039580695a6ea8a5c4ea0e229fc3e6a24a41f8ce285bf111 +size 147514 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/fb6c97a5-02f3-4345-abe7-a3b8d99dd025.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/fb6c97a5-02f3-4345-abe7-a3b8d99dd025.lance new file mode 100644 index 0000000000000000000000000000000000000000..c4e97a61bef819c2736aaab5aff0bfa40991a933 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/fb6c97a5-02f3-4345-abe7-a3b8d99dd025.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39e75bada8ea3b61ec1878b965667367b0b85a9b4259a0788edb0a6f857b7a2 +size 148148 diff --git 
a/.lancedb/fixed_size_chunking_BAAI.lance/data/fbc20690-be17-4322-8749-76ce7a3a265e.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/fbc20690-be17-4322-8749-76ce7a3a265e.lance new file mode 100644 index 0000000000000000000000000000000000000000..b874713e253842bf6a1cf52141adead548329229 --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/fbc20690-be17-4322-8749-76ce7a3a265e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95cc74d1f6aa89b25fdaba0f27dc8f4ab952c6108706178a1997b31f29df3f7c +size 147730 diff --git a/.lancedb/fixed_size_chunking_BAAI.lance/data/fc8f4423-84cc-41fd-8ca0-6f175a755dca.lance b/.lancedb/fixed_size_chunking_BAAI.lance/data/fc8f4423-84cc-41fd-8ca0-6f175a755dca.lance new file mode 100644 index 0000000000000000000000000000000000000000..2dd82236c9dac30789af3acc3c540471e6713d3d --- /dev/null +++ b/.lancedb/fixed_size_chunking_BAAI.lance/data/fc8f4423-84cc-41fd-8ca0-6f175a755dca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058fc929327eddb998b4fcf6e9e8501fc5d3136d894933631dd6f83f868bbcb0 +size 147143 diff --git a/.lancedb/nltk_chunking.lance/_indices/bf91d2c3-9348-4220-92ab-c11157368788/index.idx b/.lancedb/nltk_chunking.lance/_indices/bf91d2c3-9348-4220-92ab-c11157368788/index.idx new file mode 100644 index 0000000000000000000000000000000000000000..3a2a4b31e1b99b04bc316e81a2381b0bda9d2ee7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/_indices/bf91d2c3-9348-4220-92ab-c11157368788/index.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38ce7fd981113d4aa46d74cfc2d18680419a676e64c256d2c92d2cfb8ca0b06a +size 1038733 diff --git a/.lancedb/nltk_chunking.lance/_latest.manifest b/.lancedb/nltk_chunking.lance/_latest.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5dfa529d6ff071c0c10f0f00f8cbc15911555536 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_latest.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/0-9782e067-6de3-46d2-a022-09775d40af3a.txn b/.lancedb/nltk_chunking.lance/_transactions/0-9782e067-6de3-46d2-a022-09775d40af3a.txn new file mode 100644 index 0000000000000000000000000000000000000000..498b1220898e64848d6b6bfd7e7be9404d4cc22a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/_transactions/0-9782e067-6de3-46d2-a022-09775d40af3a.txn @@ -0,0 +1 @@ +$9782e067-6de3-46d2-a022-09775d40af3a²U2vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:38408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/nltk_chunking.lance/_transactions/1-ae08ddfa-c659-4ac7-ae63-a88022f1c1f9.txn b/.lancedb/nltk_chunking.lance/_transactions/1-ae08ddfa-c659-4ac7-ae63-a88022f1c1f9.txn new file mode 100644 index 0000000000000000000000000000000000000000..8baee6a21fd7991e186bc33fdf4fc8bcd18c72c8 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/_transactions/1-ae08ddfa-c659-4ac7-ae63-a88022f1c1f9.txn @@ -0,0 +1 @@ +$ae08ddfa-c659-4ac7-ae63-a88022f1c1f9²U2vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:38408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/nltk_chunking.lance/_transactions/10-918b3d18-cd94-41a4-976e-02f9ca27fc80.txn b/.lancedb/nltk_chunking.lance/_transactions/10-918b3d18-cd94-41a4-976e-02f9ca27fc80.txn new file mode 100644 index 0000000000000000000000000000000000000000..269ede05061044fca1a63322eabf40f3c1194745 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/10-918b3d18-cd94-41a4-976e-02f9ca27fc80.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/100-466e8934-453a-4b28-bdb9-f05b3da7487e.txn b/.lancedb/nltk_chunking.lance/_transactions/100-466e8934-453a-4b28-bdb9-f05b3da7487e.txn new file mode 100644 index 0000000000000000000000000000000000000000..0e94e72038deff5a7baca11dc31f45a17ae842a1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/100-466e8934-453a-4b28-bdb9-f05b3da7487e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/101-1f49a9de-b514-4fa0-8391-dc1c40b45fe8.txn b/.lancedb/nltk_chunking.lance/_transactions/101-1f49a9de-b514-4fa0-8391-dc1c40b45fe8.txn new file mode 100644 index 0000000000000000000000000000000000000000..7167ca29cac085d2c0fbcba5d9ee351e0dd81c9c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/101-1f49a9de-b514-4fa0-8391-dc1c40b45fe8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/102-baf2e381-7315-403e-b9ec-cb2325efa86a.txn b/.lancedb/nltk_chunking.lance/_transactions/102-baf2e381-7315-403e-b9ec-cb2325efa86a.txn new file mode 100644 index 0000000000000000000000000000000000000000..41ec9633a32b38ca25590b9286e8dbbfc5b1cd5a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/102-baf2e381-7315-403e-b9ec-cb2325efa86a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/103-d99008e1-e0f3-402b-929e-093d94da3c06.txn b/.lancedb/nltk_chunking.lance/_transactions/103-d99008e1-e0f3-402b-929e-093d94da3c06.txn new file mode 100644 index 0000000000000000000000000000000000000000..1309379add4acc18ac89bfb14e1c730e5bb1b2d3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/103-d99008e1-e0f3-402b-929e-093d94da3c06.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/104-7607a729-b03c-4eab-bcbd-2d3abe17ebce.txn b/.lancedb/nltk_chunking.lance/_transactions/104-7607a729-b03c-4eab-bcbd-2d3abe17ebce.txn new file mode 100644 index 0000000000000000000000000000000000000000..450580f4b01dfa8d200e6036f29d426ad2a31397 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/104-7607a729-b03c-4eab-bcbd-2d3abe17ebce.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/105-c5c0e11f-b549-4271-9231-46674d72b75a.txn b/.lancedb/nltk_chunking.lance/_transactions/105-c5c0e11f-b549-4271-9231-46674d72b75a.txn new file mode 100644 index 0000000000000000000000000000000000000000..b4939dbd56db49147a53b21a4c337e5af7c4fa9a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/105-c5c0e11f-b549-4271-9231-46674d72b75a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/106-e0bee6cc-2be0-4686-8a26-73b979aff80d.txn b/.lancedb/nltk_chunking.lance/_transactions/106-e0bee6cc-2be0-4686-8a26-73b979aff80d.txn new file mode 100644 index 0000000000000000000000000000000000000000..be6cd115afd9d61327277f61e8029c393f0e435f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/106-e0bee6cc-2be0-4686-8a26-73b979aff80d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/107-1df7b69d-414c-4214-a526-947affe83998.txn b/.lancedb/nltk_chunking.lance/_transactions/107-1df7b69d-414c-4214-a526-947affe83998.txn new file mode 100644 index 0000000000000000000000000000000000000000..5de8982fcf94e186fd4c2ef803f2316a22a07ca5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/107-1df7b69d-414c-4214-a526-947affe83998.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/108-b16864e1-602d-4cbe-bd4e-0b65168a0cb8.txn 
b/.lancedb/nltk_chunking.lance/_transactions/108-b16864e1-602d-4cbe-bd4e-0b65168a0cb8.txn new file mode 100644 index 0000000000000000000000000000000000000000..ee46bdfbc4df6a41c81ef7eaedf1323a0bc986a5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/108-b16864e1-602d-4cbe-bd4e-0b65168a0cb8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/109-201470c8-9bbc-4509-b520-e0e73773a83b.txn b/.lancedb/nltk_chunking.lance/_transactions/109-201470c8-9bbc-4509-b520-e0e73773a83b.txn new file mode 100644 index 0000000000000000000000000000000000000000..12a822dfa4156597ca28eff08415679408f46512 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/109-201470c8-9bbc-4509-b520-e0e73773a83b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/11-1c5fb947-be08-4716-be7d-985684609897.txn b/.lancedb/nltk_chunking.lance/_transactions/11-1c5fb947-be08-4716-be7d-985684609897.txn new file mode 100644 index 0000000000000000000000000000000000000000..f6e7122b13b3799a14d335d3763c7d00cb69670d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/11-1c5fb947-be08-4716-be7d-985684609897.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/110-6705cba1-586a-4a2a-966e-3b0021587192.txn b/.lancedb/nltk_chunking.lance/_transactions/110-6705cba1-586a-4a2a-966e-3b0021587192.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce960a92aa23154c5ca47566e8b2e52b9155109d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/110-6705cba1-586a-4a2a-966e-3b0021587192.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/111-d691f0cb-a355-47cd-aa7f-5cb58a97f023.txn b/.lancedb/nltk_chunking.lance/_transactions/111-d691f0cb-a355-47cd-aa7f-5cb58a97f023.txn new file mode 100644 index 0000000000000000000000000000000000000000..15b824274df5bc180ebead69b107282bd5aa19de Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/111-d691f0cb-a355-47cd-aa7f-5cb58a97f023.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/112-4f6f04bb-23a9-4df3-8867-6c24a45c2808.txn b/.lancedb/nltk_chunking.lance/_transactions/112-4f6f04bb-23a9-4df3-8867-6c24a45c2808.txn new file mode 100644 index 0000000000000000000000000000000000000000..e010c797ad0c8ba6ef57a115800c070b81b4ae78 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/112-4f6f04bb-23a9-4df3-8867-6c24a45c2808.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/113-222554ef-ddb3-499c-ad67-3fe26b569792.txn b/.lancedb/nltk_chunking.lance/_transactions/113-222554ef-ddb3-499c-ad67-3fe26b569792.txn new file mode 100644 index 0000000000000000000000000000000000000000..d5e16d5e1361df6a6af1223c522dd480b0f9fa91 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/113-222554ef-ddb3-499c-ad67-3fe26b569792.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/114-945e50d1-5ea1-4b11-8a6a-840fb008f30a.txn b/.lancedb/nltk_chunking.lance/_transactions/114-945e50d1-5ea1-4b11-8a6a-840fb008f30a.txn new file mode 100644 index 0000000000000000000000000000000000000000..b612875bebf0502ffcc4846c18c3c3a2362efecc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/114-945e50d1-5ea1-4b11-8a6a-840fb008f30a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/115-985f59e7-4e48-476e-b3c3-83dedc393905.txn b/.lancedb/nltk_chunking.lance/_transactions/115-985f59e7-4e48-476e-b3c3-83dedc393905.txn new file mode 100644 index 
0000000000000000000000000000000000000000..ba868ae5730a6c3915951d536ffd380e8f85d09f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/115-985f59e7-4e48-476e-b3c3-83dedc393905.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/116-ebd86410-26d5-4ab9-84ed-b1f7c844b909.txn b/.lancedb/nltk_chunking.lance/_transactions/116-ebd86410-26d5-4ab9-84ed-b1f7c844b909.txn new file mode 100644 index 0000000000000000000000000000000000000000..a361978773fece67860b00aa7df32311935d5197 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/116-ebd86410-26d5-4ab9-84ed-b1f7c844b909.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/117-59e49428-e750-41cb-aba0-08005f7c09cd.txn b/.lancedb/nltk_chunking.lance/_transactions/117-59e49428-e750-41cb-aba0-08005f7c09cd.txn new file mode 100644 index 0000000000000000000000000000000000000000..182b61f4f96919c6813c76807b0d9b96c44a665e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/117-59e49428-e750-41cb-aba0-08005f7c09cd.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/118-a8f9c4e4-8ce3-40ae-a4a9-13393576ea70.txn b/.lancedb/nltk_chunking.lance/_transactions/118-a8f9c4e4-8ce3-40ae-a4a9-13393576ea70.txn new file mode 100644 index 0000000000000000000000000000000000000000..1581415df6b79e4ac4052da0222552fdaeee797d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/118-a8f9c4e4-8ce3-40ae-a4a9-13393576ea70.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/119-9783f46a-82ba-4466-abb8-e664b7251e58.txn b/.lancedb/nltk_chunking.lance/_transactions/119-9783f46a-82ba-4466-abb8-e664b7251e58.txn new file mode 100644 index 0000000000000000000000000000000000000000..084e60b6fc2c8d9859e010fefb003513cf740f2d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/119-9783f46a-82ba-4466-abb8-e664b7251e58.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/12-a1d5d214-afbd-4450-a607-765a5533a4c1.txn b/.lancedb/nltk_chunking.lance/_transactions/12-a1d5d214-afbd-4450-a607-765a5533a4c1.txn new file mode 100644 index 0000000000000000000000000000000000000000..50760a757d43ae6f294afc8f45c1af4487effaae Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/12-a1d5d214-afbd-4450-a607-765a5533a4c1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/120-2eb53ee3-4ca1-411f-a582-c25e1a2cd9d0.txn b/.lancedb/nltk_chunking.lance/_transactions/120-2eb53ee3-4ca1-411f-a582-c25e1a2cd9d0.txn new file mode 100644 index 0000000000000000000000000000000000000000..66499db39e96296676439d2ac17b6991dd631748 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/120-2eb53ee3-4ca1-411f-a582-c25e1a2cd9d0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/121-bac8636e-23d9-4595-83c2-df45f3b74450.txn b/.lancedb/nltk_chunking.lance/_transactions/121-bac8636e-23d9-4595-83c2-df45f3b74450.txn new file mode 100644 index 0000000000000000000000000000000000000000..85e031ef4b86e3dd41dde4273f9891ba3b35400d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/121-bac8636e-23d9-4595-83c2-df45f3b74450.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/122-89bcf903-cfc2-4a4a-8136-d1aded10aa35.txn b/.lancedb/nltk_chunking.lance/_transactions/122-89bcf903-cfc2-4a4a-8136-d1aded10aa35.txn new file mode 100644 index 0000000000000000000000000000000000000000..fc8c6d53130ae8332b5d06754543e2083d07e4ab Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/122-89bcf903-cfc2-4a4a-8136-d1aded10aa35.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/123-9f30f4e1-dec1-4813-9386-f90f3e9ba444.txn b/.lancedb/nltk_chunking.lance/_transactions/123-9f30f4e1-dec1-4813-9386-f90f3e9ba444.txn new file mode 100644 index 0000000000000000000000000000000000000000..2eb9cac7332eb5191ea8a651ea9b6ab728efcb73 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/123-9f30f4e1-dec1-4813-9386-f90f3e9ba444.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/124-939b7ed5-fc21-4999-9f83-87fa23bf5175.txn b/.lancedb/nltk_chunking.lance/_transactions/124-939b7ed5-fc21-4999-9f83-87fa23bf5175.txn new file mode 100644 index 0000000000000000000000000000000000000000..9bee3432342412a2035fe911c92fc13955764dc3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/124-939b7ed5-fc21-4999-9f83-87fa23bf5175.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/125-85811fe6-3047-4c7a-9e20-31b4aab5adad.txn b/.lancedb/nltk_chunking.lance/_transactions/125-85811fe6-3047-4c7a-9e20-31b4aab5adad.txn new file mode 100644 index 0000000000000000000000000000000000000000..04f7d82aa197574d6a31d36486d5dfdaf7c273c0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/125-85811fe6-3047-4c7a-9e20-31b4aab5adad.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/126-eadcc6f3-af5f-498b-8eec-dd6eecc4cb31.txn b/.lancedb/nltk_chunking.lance/_transactions/126-eadcc6f3-af5f-498b-8eec-dd6eecc4cb31.txn new file mode 100644 index 0000000000000000000000000000000000000000..0bac6eae4a0e7f400758c4f3d4f48ee9e961bcf9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/126-eadcc6f3-af5f-498b-8eec-dd6eecc4cb31.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/127-dc39c202-7f16-43ca-9c4e-9845be43a34d.txn b/.lancedb/nltk_chunking.lance/_transactions/127-dc39c202-7f16-43ca-9c4e-9845be43a34d.txn new file mode 100644 index 0000000000000000000000000000000000000000..bf54c35106bde230101a8c001bbaf24e6557f56d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/127-dc39c202-7f16-43ca-9c4e-9845be43a34d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/128-12b9fde9-b7fc-4abe-ba3b-644a2e47b48c.txn b/.lancedb/nltk_chunking.lance/_transactions/128-12b9fde9-b7fc-4abe-ba3b-644a2e47b48c.txn new file mode 100644 index 0000000000000000000000000000000000000000..57216d2399a6fd374cd0d3cff9e75364fdbe434a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/128-12b9fde9-b7fc-4abe-ba3b-644a2e47b48c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/129-f7add8d1-1b61-41c0-920d-75241183edc6.txn b/.lancedb/nltk_chunking.lance/_transactions/129-f7add8d1-1b61-41c0-920d-75241183edc6.txn new file mode 100644 index 0000000000000000000000000000000000000000..060e6f6430e199557c942cf5b3f4ed965ab1ebc8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/129-f7add8d1-1b61-41c0-920d-75241183edc6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/13-cac2d957-606f-4c03-98b6-11003de1ce09.txn b/.lancedb/nltk_chunking.lance/_transactions/13-cac2d957-606f-4c03-98b6-11003de1ce09.txn new file mode 100644 index 0000000000000000000000000000000000000000..bfa11766a5b32bc6938b811a708e67fc5e174253 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/13-cac2d957-606f-4c03-98b6-11003de1ce09.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/130-5f8f05c5-d201-46be-afd0-f8e7e5d6b550.txn b/.lancedb/nltk_chunking.lance/_transactions/130-5f8f05c5-d201-46be-afd0-f8e7e5d6b550.txn new file mode 100644 index 0000000000000000000000000000000000000000..e0a5817197f04bde9e86fac8bfdabc6d7de62f30 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/130-5f8f05c5-d201-46be-afd0-f8e7e5d6b550.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/131-462b41c3-bbba-4fc3-b926-110fe24e842d.txn b/.lancedb/nltk_chunking.lance/_transactions/131-462b41c3-bbba-4fc3-b926-110fe24e842d.txn new file mode 100644 index 0000000000000000000000000000000000000000..64c6086eb53a4aa2cc5f16f5e1df250251a49e82 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/131-462b41c3-bbba-4fc3-b926-110fe24e842d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/132-d5a9985b-0287-4990-a64f-ef96055022d5.txn b/.lancedb/nltk_chunking.lance/_transactions/132-d5a9985b-0287-4990-a64f-ef96055022d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..95f44795a8b59e0934186d38a17996ddc553bca8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/132-d5a9985b-0287-4990-a64f-ef96055022d5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/133-8aa23806-befd-4e80-86a7-40ec30f7c6e8.txn b/.lancedb/nltk_chunking.lance/_transactions/133-8aa23806-befd-4e80-86a7-40ec30f7c6e8.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f0fa49a36d827fe4dcd78e6b44945df30c39789 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/133-8aa23806-befd-4e80-86a7-40ec30f7c6e8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/134-56554635-7abd-4c48-b607-38fcb217ef4c.txn b/.lancedb/nltk_chunking.lance/_transactions/134-56554635-7abd-4c48-b607-38fcb217ef4c.txn new file mode 100644 index 0000000000000000000000000000000000000000..34bd887a8bdd59d3516810112e277c187fbc5ba4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/134-56554635-7abd-4c48-b607-38fcb217ef4c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/135-b825746a-418a-4532-a798-83c5517194a8.txn b/.lancedb/nltk_chunking.lance/_transactions/135-b825746a-418a-4532-a798-83c5517194a8.txn new file mode 100644 index 0000000000000000000000000000000000000000..1d936dfd0cce8a0fb575a47336469c35de03e79a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/135-b825746a-418a-4532-a798-83c5517194a8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/136-aa979d1c-ead3-4bc0-85e9-8d31893c6d3d.txn b/.lancedb/nltk_chunking.lance/_transactions/136-aa979d1c-ead3-4bc0-85e9-8d31893c6d3d.txn new file mode 100644 index 0000000000000000000000000000000000000000..9e6c0ed555dc01e8b3f4779a3febb98e4c525dfa Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/136-aa979d1c-ead3-4bc0-85e9-8d31893c6d3d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/137-242b000c-3939-433b-9180-3f672489ef59.txn b/.lancedb/nltk_chunking.lance/_transactions/137-242b000c-3939-433b-9180-3f672489ef59.txn new file mode 100644 index 0000000000000000000000000000000000000000..f1f0453cc4aae1a5aab582879c1c02d549c68dbd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/137-242b000c-3939-433b-9180-3f672489ef59.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/138-60974ea3-ae1a-4071-9bef-cdedcf88c15a.txn 
b/.lancedb/nltk_chunking.lance/_transactions/138-60974ea3-ae1a-4071-9bef-cdedcf88c15a.txn new file mode 100644 index 0000000000000000000000000000000000000000..b4bba1beaecdb51af02274376defe1bcdfa741f5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/138-60974ea3-ae1a-4071-9bef-cdedcf88c15a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/139-bf31f244-2f23-49a9-bdb0-c51e394209b9.txn b/.lancedb/nltk_chunking.lance/_transactions/139-bf31f244-2f23-49a9-bdb0-c51e394209b9.txn new file mode 100644 index 0000000000000000000000000000000000000000..fd1246a71a2b8abe95da30bc7e1db4dcceae104b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/139-bf31f244-2f23-49a9-bdb0-c51e394209b9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/14-e141b121-3965-47aa-8561-8b875654d872.txn b/.lancedb/nltk_chunking.lance/_transactions/14-e141b121-3965-47aa-8561-8b875654d872.txn new file mode 100644 index 0000000000000000000000000000000000000000..351144c12ce1fdcb27e8ba5f8989d94e129cc890 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/14-e141b121-3965-47aa-8561-8b875654d872.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/140-6a7a4552-e8e8-4044-9210-98046e6c83ab.txn b/.lancedb/nltk_chunking.lance/_transactions/140-6a7a4552-e8e8-4044-9210-98046e6c83ab.txn new file mode 100644 index 0000000000000000000000000000000000000000..e1d60244a47a56a917f4d928f3544034f462a09c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/140-6a7a4552-e8e8-4044-9210-98046e6c83ab.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/141-ea54b71a-f376-41d4-a020-eed1f00247e6.txn b/.lancedb/nltk_chunking.lance/_transactions/141-ea54b71a-f376-41d4-a020-eed1f00247e6.txn new file mode 100644 index 0000000000000000000000000000000000000000..6b988eabf31c77452f1e1983582d7439272e6707 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/141-ea54b71a-f376-41d4-a020-eed1f00247e6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/142-56ff2be8-faae-4c1f-88aa-5f3ac59da46e.txn b/.lancedb/nltk_chunking.lance/_transactions/142-56ff2be8-faae-4c1f-88aa-5f3ac59da46e.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a028b9a526b7606b91139855e7caf214788705b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/142-56ff2be8-faae-4c1f-88aa-5f3ac59da46e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/143-523f314e-6009-4abc-938c-d5ecf06794a4.txn b/.lancedb/nltk_chunking.lance/_transactions/143-523f314e-6009-4abc-938c-d5ecf06794a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..e22dfbc1485fb7cb7a2ba487917db3393d49fec3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/143-523f314e-6009-4abc-938c-d5ecf06794a4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/144-f5b1085d-fdd4-45b2-9b84-5ce7e76f4b2f.txn b/.lancedb/nltk_chunking.lance/_transactions/144-f5b1085d-fdd4-45b2-9b84-5ce7e76f4b2f.txn new file mode 100644 index 0000000000000000000000000000000000000000..71b12b0baea370e74329d3f3cd3788504357ae04 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/144-f5b1085d-fdd4-45b2-9b84-5ce7e76f4b2f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/145-a8b6b180-51b4-40e6-b3f9-992ff8bde225.txn b/.lancedb/nltk_chunking.lance/_transactions/145-a8b6b180-51b4-40e6-b3f9-992ff8bde225.txn new file mode 100644 index 
0000000000000000000000000000000000000000..3738ffa93a634e18e3c9e48a243ab66884774677 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/145-a8b6b180-51b4-40e6-b3f9-992ff8bde225.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/146-5160ca6b-8fc0-412d-8ff6-26dfd6f27bba.txn b/.lancedb/nltk_chunking.lance/_transactions/146-5160ca6b-8fc0-412d-8ff6-26dfd6f27bba.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd2872f441b84e13e79a365968da79d0ca2db77f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/146-5160ca6b-8fc0-412d-8ff6-26dfd6f27bba.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/147-1ac288f2-f523-49c7-9b2f-fc7f50cb78fc.txn b/.lancedb/nltk_chunking.lance/_transactions/147-1ac288f2-f523-49c7-9b2f-fc7f50cb78fc.txn new file mode 100644 index 0000000000000000000000000000000000000000..d67d30aa9a409e01b4502c6f832f973ae03f8a62 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/147-1ac288f2-f523-49c7-9b2f-fc7f50cb78fc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/148-feda8f4f-0258-4cfc-977c-ca8ab3d1ef42.txn b/.lancedb/nltk_chunking.lance/_transactions/148-feda8f4f-0258-4cfc-977c-ca8ab3d1ef42.txn new file mode 100644 index 0000000000000000000000000000000000000000..77d4de56b232434f0ef9ffb4c1cf7807a42ec1f1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/148-feda8f4f-0258-4cfc-977c-ca8ab3d1ef42.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/149-8bd90ff3-9b75-4702-860b-a83414ed331f.txn b/.lancedb/nltk_chunking.lance/_transactions/149-8bd90ff3-9b75-4702-860b-a83414ed331f.txn new file mode 100644 index 0000000000000000000000000000000000000000..c89694ea47b397133fbcfa2a4f848e775afe613a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/149-8bd90ff3-9b75-4702-860b-a83414ed331f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/15-4c70cc27-c181-4571-9b5c-f498f909b539.txn b/.lancedb/nltk_chunking.lance/_transactions/15-4c70cc27-c181-4571-9b5c-f498f909b539.txn new file mode 100644 index 0000000000000000000000000000000000000000..7f0704f83b88bfda2191dca3230da490b8772a95 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/15-4c70cc27-c181-4571-9b5c-f498f909b539.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/150-f776d3ae-8650-4295-93b9-ead02388c2b6.txn b/.lancedb/nltk_chunking.lance/_transactions/150-f776d3ae-8650-4295-93b9-ead02388c2b6.txn new file mode 100644 index 0000000000000000000000000000000000000000..2539c2ea918502af1624d23a87feaadc8bfe5105 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/150-f776d3ae-8650-4295-93b9-ead02388c2b6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/151-815aebcf-bac5-493e-aa3f-f59bbb9d02ac.txn b/.lancedb/nltk_chunking.lance/_transactions/151-815aebcf-bac5-493e-aa3f-f59bbb9d02ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..064b2428d2c7eeef9ee3a4a1f86dd70cf7a3b23e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/151-815aebcf-bac5-493e-aa3f-f59bbb9d02ac.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/152-be3b2836-bc36-4c81-89dc-cdf24b02756c.txn b/.lancedb/nltk_chunking.lance/_transactions/152-be3b2836-bc36-4c81-89dc-cdf24b02756c.txn new file mode 100644 index 0000000000000000000000000000000000000000..a2a4902164b341578fd7604021aedcd66acaf410 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/152-be3b2836-bc36-4c81-89dc-cdf24b02756c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/153-8cfb3068-cb16-47c4-9e81-b589fbad10b3.txn b/.lancedb/nltk_chunking.lance/_transactions/153-8cfb3068-cb16-47c4-9e81-b589fbad10b3.txn new file mode 100644 index 0000000000000000000000000000000000000000..22a8dddfc1056862bad6199a382afadee5d38ef9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/153-8cfb3068-cb16-47c4-9e81-b589fbad10b3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/154-9241873e-4a90-4743-b38b-4596cde62d47.txn b/.lancedb/nltk_chunking.lance/_transactions/154-9241873e-4a90-4743-b38b-4596cde62d47.txn new file mode 100644 index 0000000000000000000000000000000000000000..407f57bdbe8163efd225fc89739f126ab4eb88fe Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/154-9241873e-4a90-4743-b38b-4596cde62d47.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/155-da7a0704-2575-40f8-bb75-e3ca7e6c98d9.txn b/.lancedb/nltk_chunking.lance/_transactions/155-da7a0704-2575-40f8-bb75-e3ca7e6c98d9.txn new file mode 100644 index 0000000000000000000000000000000000000000..2bef4bec9bbd46d6fe4d158a0b8f14804624e10d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/155-da7a0704-2575-40f8-bb75-e3ca7e6c98d9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/156-099a85d7-5a17-4c5a-a0a4-98fe2354ac39.txn b/.lancedb/nltk_chunking.lance/_transactions/156-099a85d7-5a17-4c5a-a0a4-98fe2354ac39.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a1f02b54a619f08cf6bedbd23f50c2887aa94db Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/156-099a85d7-5a17-4c5a-a0a4-98fe2354ac39.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/157-cd1c4231-519b-46c2-bcb2-fd52bb13c00a.txn b/.lancedb/nltk_chunking.lance/_transactions/157-cd1c4231-519b-46c2-bcb2-fd52bb13c00a.txn new file mode 100644 index 0000000000000000000000000000000000000000..18d646abaf5cecafb6f5113be5c3fcb5eae9d77e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/157-cd1c4231-519b-46c2-bcb2-fd52bb13c00a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/158-e64ead4f-e32d-474b-b46e-d0a950a6772b.txn b/.lancedb/nltk_chunking.lance/_transactions/158-e64ead4f-e32d-474b-b46e-d0a950a6772b.txn new file mode 100644 index 0000000000000000000000000000000000000000..6edddede48d86edb951b92c36b9ee76fca1a7cd2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/158-e64ead4f-e32d-474b-b46e-d0a950a6772b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/159-2995a175-c21d-4d43-b4ab-28a384edeffa.txn b/.lancedb/nltk_chunking.lance/_transactions/159-2995a175-c21d-4d43-b4ab-28a384edeffa.txn new file mode 100644 index 0000000000000000000000000000000000000000..4d20bd4c67970c106fe26dcb0f048a2b4f6ab73d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/159-2995a175-c21d-4d43-b4ab-28a384edeffa.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/16-ee73caf2-8526-4389-b4da-f9e778d75890.txn b/.lancedb/nltk_chunking.lance/_transactions/16-ee73caf2-8526-4389-b4da-f9e778d75890.txn new file mode 100644 index 0000000000000000000000000000000000000000..4f3f587ece1421e46d01edb646a5aa99f1cc303c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/16-ee73caf2-8526-4389-b4da-f9e778d75890.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/160-d8a89ac7-ba6b-4279-8ca9-d98f2194796e.txn b/.lancedb/nltk_chunking.lance/_transactions/160-d8a89ac7-ba6b-4279-8ca9-d98f2194796e.txn new file mode 100644 index 0000000000000000000000000000000000000000..b254c12fc7f6300dc3ff45157851194774a6412b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/160-d8a89ac7-ba6b-4279-8ca9-d98f2194796e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/161-b8427357-1284-416a-a4f9-6e3abbfbb9f4.txn b/.lancedb/nltk_chunking.lance/_transactions/161-b8427357-1284-416a-a4f9-6e3abbfbb9f4.txn new file mode 100644 index 0000000000000000000000000000000000000000..86a0a03014bd3f057aa7641bd6b0661bd666a926 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/161-b8427357-1284-416a-a4f9-6e3abbfbb9f4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/162-0fffb2e2-07b5-4151-8297-b5ce3e29e852.txn b/.lancedb/nltk_chunking.lance/_transactions/162-0fffb2e2-07b5-4151-8297-b5ce3e29e852.txn new file mode 100644 index 0000000000000000000000000000000000000000..bfbfc6c01442314706b971e6be34e1e92ceb424a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/162-0fffb2e2-07b5-4151-8297-b5ce3e29e852.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/163-86ca65dc-8b8c-4003-938e-ce1a784c1616.txn b/.lancedb/nltk_chunking.lance/_transactions/163-86ca65dc-8b8c-4003-938e-ce1a784c1616.txn new file mode 100644 index 0000000000000000000000000000000000000000..56a13dd0bfc098b641939dfae9fa79da9e6129e0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/163-86ca65dc-8b8c-4003-938e-ce1a784c1616.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/164-b7d6e786-5889-46b4-94cc-8fb14ce52b28.txn b/.lancedb/nltk_chunking.lance/_transactions/164-b7d6e786-5889-46b4-94cc-8fb14ce52b28.txn new file mode 100644 index 0000000000000000000000000000000000000000..e42a7ae9bcc560623f25dee6032f4a0d835d6070 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/164-b7d6e786-5889-46b4-94cc-8fb14ce52b28.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/165-8552a5f5-9d09-4690-9c40-35f1c8a2bb8a.txn b/.lancedb/nltk_chunking.lance/_transactions/165-8552a5f5-9d09-4690-9c40-35f1c8a2bb8a.txn new file mode 100644 index 0000000000000000000000000000000000000000..7c15f5558a4ee10ab46bfa90b5d8e848783c5455 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/165-8552a5f5-9d09-4690-9c40-35f1c8a2bb8a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/166-b3f0f273-e90f-466e-806c-bb1fb41dfd2b.txn b/.lancedb/nltk_chunking.lance/_transactions/166-b3f0f273-e90f-466e-806c-bb1fb41dfd2b.txn new file mode 100644 index 0000000000000000000000000000000000000000..5b0a00c0ece289584f3764efba856df19799b45c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/166-b3f0f273-e90f-466e-806c-bb1fb41dfd2b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/167-6597a4d6-59bc-487c-b411-3f0110968f41.txn b/.lancedb/nltk_chunking.lance/_transactions/167-6597a4d6-59bc-487c-b411-3f0110968f41.txn new file mode 100644 index 0000000000000000000000000000000000000000..455e6988e3e8fe36065e8ca4dde12d3051bed7c2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/167-6597a4d6-59bc-487c-b411-3f0110968f41.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/168-75004c55-c45e-46f4-986e-40315c6a2332.txn 
b/.lancedb/nltk_chunking.lance/_transactions/168-75004c55-c45e-46f4-986e-40315c6a2332.txn new file mode 100644 index 0000000000000000000000000000000000000000..edc58f90ee933e3c23899d86dfe708ea2f6b7a69 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/168-75004c55-c45e-46f4-986e-40315c6a2332.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/169-70c1a7a0-6bd1-464b-8f4c-e1159b6ed598.txn b/.lancedb/nltk_chunking.lance/_transactions/169-70c1a7a0-6bd1-464b-8f4c-e1159b6ed598.txn new file mode 100644 index 0000000000000000000000000000000000000000..c377c8a77f7d6ff7d32498d5c092b738eac81230 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/169-70c1a7a0-6bd1-464b-8f4c-e1159b6ed598.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/17-85d59496-7c7f-40cb-9c00-31c0e96e2b09.txn b/.lancedb/nltk_chunking.lance/_transactions/17-85d59496-7c7f-40cb-9c00-31c0e96e2b09.txn new file mode 100644 index 0000000000000000000000000000000000000000..39733843c3db2cc33985cb478d319279fd444599 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/17-85d59496-7c7f-40cb-9c00-31c0e96e2b09.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/170-b672bef5-675d-48d5-a722-30afd141c4b6.txn b/.lancedb/nltk_chunking.lance/_transactions/170-b672bef5-675d-48d5-a722-30afd141c4b6.txn new file mode 100644 index 0000000000000000000000000000000000000000..2afdff96b091b0ada4a313818ec46143abfeec01 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/170-b672bef5-675d-48d5-a722-30afd141c4b6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/171-f3748c75-c098-488f-bbcc-f42c3f19067b.txn b/.lancedb/nltk_chunking.lance/_transactions/171-f3748c75-c098-488f-bbcc-f42c3f19067b.txn new file mode 100644 index 0000000000000000000000000000000000000000..a2b659a7f20b48b15f6c1a9f94c6a1216eccac3e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/171-f3748c75-c098-488f-bbcc-f42c3f19067b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/172-4dce56c3-0861-4ad9-b424-ff79f39b96cb.txn b/.lancedb/nltk_chunking.lance/_transactions/172-4dce56c3-0861-4ad9-b424-ff79f39b96cb.txn new file mode 100644 index 0000000000000000000000000000000000000000..951e9736d8f302c9c3c6c8f284654d737c34f772 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/172-4dce56c3-0861-4ad9-b424-ff79f39b96cb.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/173-1678002a-930c-4169-8695-eb74dc5ee882.txn b/.lancedb/nltk_chunking.lance/_transactions/173-1678002a-930c-4169-8695-eb74dc5ee882.txn new file mode 100644 index 0000000000000000000000000000000000000000..14bc75bb3d11103623e3cc3c517d0874c1811f93 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/173-1678002a-930c-4169-8695-eb74dc5ee882.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/174-8e9ad25c-6ee9-4cc8-90bd-917f045efaf8.txn b/.lancedb/nltk_chunking.lance/_transactions/174-8e9ad25c-6ee9-4cc8-90bd-917f045efaf8.txn new file mode 100644 index 0000000000000000000000000000000000000000..fd48200387418a272442bca9dca5cfb4d95247be Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/174-8e9ad25c-6ee9-4cc8-90bd-917f045efaf8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/175-b0318767-2513-47df-a331-99f488193239.txn b/.lancedb/nltk_chunking.lance/_transactions/175-b0318767-2513-47df-a331-99f488193239.txn new file mode 100644 index 
0000000000000000000000000000000000000000..8f795528dcfec046763d9b82e8d17af0148676ce Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/175-b0318767-2513-47df-a331-99f488193239.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/176-d9c659f7-2cfd-4588-b743-8ae4393d4d88.txn b/.lancedb/nltk_chunking.lance/_transactions/176-d9c659f7-2cfd-4588-b743-8ae4393d4d88.txn new file mode 100644 index 0000000000000000000000000000000000000000..a26f798fdb4c09777645376903e63593050c4557 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/176-d9c659f7-2cfd-4588-b743-8ae4393d4d88.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/177-9db26dbb-44a7-4e2d-9eee-71817cc9178e.txn b/.lancedb/nltk_chunking.lance/_transactions/177-9db26dbb-44a7-4e2d-9eee-71817cc9178e.txn new file mode 100644 index 0000000000000000000000000000000000000000..3df45262552f7468529a0d17868480044295ced1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/177-9db26dbb-44a7-4e2d-9eee-71817cc9178e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/178-a03f82ae-b66a-4ae3-96c1-f1ec44ce62ca.txn b/.lancedb/nltk_chunking.lance/_transactions/178-a03f82ae-b66a-4ae3-96c1-f1ec44ce62ca.txn new file mode 100644 index 0000000000000000000000000000000000000000..39dd500a46b1d78b57c3b5f1bdd914ab5286cb16 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/178-a03f82ae-b66a-4ae3-96c1-f1ec44ce62ca.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/179-da1699eb-24cd-4e48-a2af-8d73d29424ec.txn b/.lancedb/nltk_chunking.lance/_transactions/179-da1699eb-24cd-4e48-a2af-8d73d29424ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..4265f68c4b2e47e43cd6e14fe9ab0841fff20b68 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/179-da1699eb-24cd-4e48-a2af-8d73d29424ec.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/18-afeb2471-d511-476d-8c62-8866737365f9.txn b/.lancedb/nltk_chunking.lance/_transactions/18-afeb2471-d511-476d-8c62-8866737365f9.txn new file mode 100644 index 0000000000000000000000000000000000000000..bf74b22603e2a395d092a8c7fd58f708b50c6780 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/18-afeb2471-d511-476d-8c62-8866737365f9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/180-43b86456-439e-4a56-a942-b95340b75766.txn b/.lancedb/nltk_chunking.lance/_transactions/180-43b86456-439e-4a56-a942-b95340b75766.txn new file mode 100644 index 0000000000000000000000000000000000000000..15d587237e44c9e2366966801a28a64957a1980b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/180-43b86456-439e-4a56-a942-b95340b75766.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/181-744215d5-a5d4-47d7-aef2-18a7711aeb2f.txn b/.lancedb/nltk_chunking.lance/_transactions/181-744215d5-a5d4-47d7-aef2-18a7711aeb2f.txn new file mode 100644 index 0000000000000000000000000000000000000000..75489f8cd967b82e4a74fb93ee26da48df202749 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/181-744215d5-a5d4-47d7-aef2-18a7711aeb2f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/182-d8189352-a8e3-4824-a8a0-0eb1e90c9fe1.txn b/.lancedb/nltk_chunking.lance/_transactions/182-d8189352-a8e3-4824-a8a0-0eb1e90c9fe1.txn new file mode 100644 index 0000000000000000000000000000000000000000..135bcf32781b2d932c63474061158417cf54cf67 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/182-d8189352-a8e3-4824-a8a0-0eb1e90c9fe1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/183-2946da79-aaaf-4ce3-9a2c-636fb95fa40e.txn b/.lancedb/nltk_chunking.lance/_transactions/183-2946da79-aaaf-4ce3-9a2c-636fb95fa40e.txn new file mode 100644 index 0000000000000000000000000000000000000000..8f5ac815c102c427bee6625a63c54005201c5681 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/183-2946da79-aaaf-4ce3-9a2c-636fb95fa40e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/184-d4913f48-51eb-45fb-8980-a6cff123b287.txn b/.lancedb/nltk_chunking.lance/_transactions/184-d4913f48-51eb-45fb-8980-a6cff123b287.txn new file mode 100644 index 0000000000000000000000000000000000000000..35964f42c3c36fff8177f240cc889a8dc1846f08 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/184-d4913f48-51eb-45fb-8980-a6cff123b287.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/185-31d39b3d-9a0b-4e7b-a77d-39a6aa201ca5.txn b/.lancedb/nltk_chunking.lance/_transactions/185-31d39b3d-9a0b-4e7b-a77d-39a6aa201ca5.txn new file mode 100644 index 0000000000000000000000000000000000000000..b1bf88a15a5da60e55c460842c2659e22a03b33a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/185-31d39b3d-9a0b-4e7b-a77d-39a6aa201ca5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/186-ad8fffb0-45a9-43bc-9739-ce7f5f3d6c46.txn b/.lancedb/nltk_chunking.lance/_transactions/186-ad8fffb0-45a9-43bc-9739-ce7f5f3d6c46.txn new file mode 100644 index 0000000000000000000000000000000000000000..f203a7f5197d79f9c4ea92ef29404d6b87d8d2e6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/186-ad8fffb0-45a9-43bc-9739-ce7f5f3d6c46.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/187-30803e3f-eedf-447a-acee-9788355ff97d.txn b/.lancedb/nltk_chunking.lance/_transactions/187-30803e3f-eedf-447a-acee-9788355ff97d.txn new file mode 100644 index 0000000000000000000000000000000000000000..ec7cc378e9b99565873c3dac22203800d73e386b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/187-30803e3f-eedf-447a-acee-9788355ff97d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/188-00730369-29db-430e-9d4c-533b9512e07c.txn b/.lancedb/nltk_chunking.lance/_transactions/188-00730369-29db-430e-9d4c-533b9512e07c.txn new file mode 100644 index 0000000000000000000000000000000000000000..7426ed2a1bdcad7f5bb4763e3ac8dda5550cae39 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/188-00730369-29db-430e-9d4c-533b9512e07c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/189-da9c094c-8e7d-45fe-be87-125f339d57bc.txn b/.lancedb/nltk_chunking.lance/_transactions/189-da9c094c-8e7d-45fe-be87-125f339d57bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..32a372c078039f083d07e730c0b468437308f41e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/189-da9c094c-8e7d-45fe-be87-125f339d57bc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/19-16ee9714-c4e8-40f8-ba3b-247626d3dd3f.txn b/.lancedb/nltk_chunking.lance/_transactions/19-16ee9714-c4e8-40f8-ba3b-247626d3dd3f.txn new file mode 100644 index 0000000000000000000000000000000000000000..f5ad4de865bbec797e873e600cf40abd6b8960d7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/19-16ee9714-c4e8-40f8-ba3b-247626d3dd3f.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/190-a234a054-f27f-4faa-922b-5f375b75655f.txn b/.lancedb/nltk_chunking.lance/_transactions/190-a234a054-f27f-4faa-922b-5f375b75655f.txn new file mode 100644 index 0000000000000000000000000000000000000000..eb17439445de377dffc56f9a5e05f915c0a16969 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/190-a234a054-f27f-4faa-922b-5f375b75655f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/191-a787b032-41c5-46c6-a161-813b623a4268.txn b/.lancedb/nltk_chunking.lance/_transactions/191-a787b032-41c5-46c6-a161-813b623a4268.txn new file mode 100644 index 0000000000000000000000000000000000000000..c95f1a62d1f15484ed183a3e7ec3b276996d067c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/191-a787b032-41c5-46c6-a161-813b623a4268.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/192-86d4e45e-93ad-47ca-9302-f4e4825f9959.txn b/.lancedb/nltk_chunking.lance/_transactions/192-86d4e45e-93ad-47ca-9302-f4e4825f9959.txn new file mode 100644 index 0000000000000000000000000000000000000000..487b33ca33d147dafbaa12ae5b11886af7b793e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/192-86d4e45e-93ad-47ca-9302-f4e4825f9959.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/193-22070aa3-08d9-4d36-bc2d-efef9cbe8923.txn b/.lancedb/nltk_chunking.lance/_transactions/193-22070aa3-08d9-4d36-bc2d-efef9cbe8923.txn new file mode 100644 index 0000000000000000000000000000000000000000..11d85fbacee8b22c0888695bc4a1ffa9c9be8e52 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/193-22070aa3-08d9-4d36-bc2d-efef9cbe8923.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/194-aa2fceb4-b896-4ee0-a393-e372af66af0e.txn b/.lancedb/nltk_chunking.lance/_transactions/194-aa2fceb4-b896-4ee0-a393-e372af66af0e.txn new file mode 100644 index 0000000000000000000000000000000000000000..8f4cef8b316cf5a6f4fad988ca5204f70ea3f509 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/194-aa2fceb4-b896-4ee0-a393-e372af66af0e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/195-b18b4b62-8ded-4be7-a745-ee02f478ae22.txn b/.lancedb/nltk_chunking.lance/_transactions/195-b18b4b62-8ded-4be7-a745-ee02f478ae22.txn new file mode 100644 index 0000000000000000000000000000000000000000..414cd934bea3992f676b16a0987030dcef3adc4a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/195-b18b4b62-8ded-4be7-a745-ee02f478ae22.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/196-d704b76d-84ac-4f21-a731-fe9b68d472e5.txn b/.lancedb/nltk_chunking.lance/_transactions/196-d704b76d-84ac-4f21-a731-fe9b68d472e5.txn new file mode 100644 index 0000000000000000000000000000000000000000..dbb7b1ae7ab14f7c95c48af9b7eabb4ded2e00b9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/196-d704b76d-84ac-4f21-a731-fe9b68d472e5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/197-f1b48aac-46cd-4edc-a0bc-d70bc79db3f2.txn b/.lancedb/nltk_chunking.lance/_transactions/197-f1b48aac-46cd-4edc-a0bc-d70bc79db3f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..e24b6402fe0dabff6145a57e097415d840d9a2bd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/197-f1b48aac-46cd-4edc-a0bc-d70bc79db3f2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/198-8e369abe-c380-4727-93bc-e7ef7b0e6a05.txn 
b/.lancedb/nltk_chunking.lance/_transactions/198-8e369abe-c380-4727-93bc-e7ef7b0e6a05.txn new file mode 100644 index 0000000000000000000000000000000000000000..5834d826771c1a3c61346297507940dea79d8dad Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/198-8e369abe-c380-4727-93bc-e7ef7b0e6a05.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/199-2c1d6bf0-9610-4e09-8c25-ec701ec33039.txn b/.lancedb/nltk_chunking.lance/_transactions/199-2c1d6bf0-9610-4e09-8c25-ec701ec33039.txn new file mode 100644 index 0000000000000000000000000000000000000000..b9615d35fe845c334e3758ad077620601b9f1196 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/199-2c1d6bf0-9610-4e09-8c25-ec701ec33039.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/2-635fc80e-130d-4550-9e6f-8c7ffa0d3cc9.txn b/.lancedb/nltk_chunking.lance/_transactions/2-635fc80e-130d-4550-9e6f-8c7ffa0d3cc9.txn new file mode 100644 index 0000000000000000000000000000000000000000..2abda46232c99e9b611f3556c51ff7a6217ccdf4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/_transactions/2-635fc80e-130d-4550-9e6f-8c7ffa0d3cc9.txn @@ -0,0 +1 @@ +$635fc80e-130d-4550-9e6f-8c7ffa0d3cc9²U2vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:38408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/nltk_chunking.lance/_transactions/20-1da01fd8-13ff-48e0-9f1b-e27f1907d1f8.txn b/.lancedb/nltk_chunking.lance/_transactions/20-1da01fd8-13ff-48e0-9f1b-e27f1907d1f8.txn new file mode 100644 index 0000000000000000000000000000000000000000..450f5233ea512af4854c3f1b0c127d0fcccd82f1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/20-1da01fd8-13ff-48e0-9f1b-e27f1907d1f8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/200-b65291df-be61-4970-b660-565f4ac75bee.txn b/.lancedb/nltk_chunking.lance/_transactions/200-b65291df-be61-4970-b660-565f4ac75bee.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd505a66867b1ae26ae7e2387e7f8a374b6c15fb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/200-b65291df-be61-4970-b660-565f4ac75bee.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/201-c5264c0e-097e-4626-8a5f-ba7bafe8f795.txn b/.lancedb/nltk_chunking.lance/_transactions/201-c5264c0e-097e-4626-8a5f-ba7bafe8f795.txn new file mode 100644 index 0000000000000000000000000000000000000000..e3dedf7c5ca26ad91d49398275525c5a7c8471d7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/201-c5264c0e-097e-4626-8a5f-ba7bafe8f795.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/202-ce772edb-f6e5-4c86-b228-17e81c8f3cea.txn b/.lancedb/nltk_chunking.lance/_transactions/202-ce772edb-f6e5-4c86-b228-17e81c8f3cea.txn new file mode 100644 index 0000000000000000000000000000000000000000..deb68dde73928ecd8a78f2e069832815c2bacbc1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/202-ce772edb-f6e5-4c86-b228-17e81c8f3cea.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/203-41803827-f978-4b43-bf70-59dd9fbd891c.txn b/.lancedb/nltk_chunking.lance/_transactions/203-41803827-f978-4b43-bf70-59dd9fbd891c.txn new file mode 100644 index 0000000000000000000000000000000000000000..674ef9f3390432d94fed8607b78e1553f504350e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/203-41803827-f978-4b43-bf70-59dd9fbd891c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/204-4e7b96a9-4815-4555-96d6-77337d081edf.txn 
b/.lancedb/nltk_chunking.lance/_transactions/204-4e7b96a9-4815-4555-96d6-77337d081edf.txn new file mode 100644 index 0000000000000000000000000000000000000000..e69ffb09e50ae8ba12e1ab2b232daf9cc8debbe8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/204-4e7b96a9-4815-4555-96d6-77337d081edf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/205-6c1519cc-3cb4-4f78-93d1-8acc2805ab19.txn b/.lancedb/nltk_chunking.lance/_transactions/205-6c1519cc-3cb4-4f78-93d1-8acc2805ab19.txn new file mode 100644 index 0000000000000000000000000000000000000000..bea3ea21c4477412798ad7a837a14c71c7670440 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/205-6c1519cc-3cb4-4f78-93d1-8acc2805ab19.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/206-353b5b29-0ba6-4326-9539-3f674f367646.txn b/.lancedb/nltk_chunking.lance/_transactions/206-353b5b29-0ba6-4326-9539-3f674f367646.txn new file mode 100644 index 0000000000000000000000000000000000000000..7aa87de712e87d65229ddcb7e3ca6fc6e88de1da Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/206-353b5b29-0ba6-4326-9539-3f674f367646.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/207-907f3262-cd88-4833-83de-7fc6125ac72e.txn b/.lancedb/nltk_chunking.lance/_transactions/207-907f3262-cd88-4833-83de-7fc6125ac72e.txn new file mode 100644 index 0000000000000000000000000000000000000000..af9a0546bf8ef92b7868b2a23cc030049c3c31d7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/207-907f3262-cd88-4833-83de-7fc6125ac72e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/208-ec61e93e-10d7-4de1-acde-c0f0823b7704.txn b/.lancedb/nltk_chunking.lance/_transactions/208-ec61e93e-10d7-4de1-acde-c0f0823b7704.txn new file mode 100644 index 0000000000000000000000000000000000000000..37b4fe8602ad341e659cb8ec71fd5607f79581ae Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/208-ec61e93e-10d7-4de1-acde-c0f0823b7704.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/209-75c8a08d-9b44-4087-9f1b-d001423548d4.txn b/.lancedb/nltk_chunking.lance/_transactions/209-75c8a08d-9b44-4087-9f1b-d001423548d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b1b05644a41cef11dddb7304ff15c6abc57c0fa Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/209-75c8a08d-9b44-4087-9f1b-d001423548d4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/21-d01deb77-f81e-4713-8872-245ecc50217e.txn b/.lancedb/nltk_chunking.lance/_transactions/21-d01deb77-f81e-4713-8872-245ecc50217e.txn new file mode 100644 index 0000000000000000000000000000000000000000..3712206c1f23e36cfa0c14573f6188f11bc25f4f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/21-d01deb77-f81e-4713-8872-245ecc50217e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/210-19305d5a-8c74-4b15-8503-7e907eba4d23.txn b/.lancedb/nltk_chunking.lance/_transactions/210-19305d5a-8c74-4b15-8503-7e907eba4d23.txn new file mode 100644 index 0000000000000000000000000000000000000000..53304d3ce8b327ad53eb9b55787eda21791352e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/210-19305d5a-8c74-4b15-8503-7e907eba4d23.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/211-a108da1a-3063-49cd-8c2e-dab974b50234.txn b/.lancedb/nltk_chunking.lance/_transactions/211-a108da1a-3063-49cd-8c2e-dab974b50234.txn new file mode 100644 index 
0000000000000000000000000000000000000000..da93cb01c510d12c83cc27b8627f45d10e1c5394 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/211-a108da1a-3063-49cd-8c2e-dab974b50234.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/212-a8f70a00-4354-43de-bc99-fe868ab590db.txn b/.lancedb/nltk_chunking.lance/_transactions/212-a8f70a00-4354-43de-bc99-fe868ab590db.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce164715a9d7b86cba30ab628d5703f5e090d88d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/212-a8f70a00-4354-43de-bc99-fe868ab590db.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/213-b2f5c04c-417f-4b2c-a67e-113aeb6c407b.txn b/.lancedb/nltk_chunking.lance/_transactions/213-b2f5c04c-417f-4b2c-a67e-113aeb6c407b.txn new file mode 100644 index 0000000000000000000000000000000000000000..379764b8eb97f996a8ff3f09f372e8aac899f5e9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/213-b2f5c04c-417f-4b2c-a67e-113aeb6c407b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/214-a0641a6b-5d10-47f2-8245-9d8f372b4a22.txn b/.lancedb/nltk_chunking.lance/_transactions/214-a0641a6b-5d10-47f2-8245-9d8f372b4a22.txn new file mode 100644 index 0000000000000000000000000000000000000000..fe71c484aa94421168a20a8abf321a87cdcd5fdc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/214-a0641a6b-5d10-47f2-8245-9d8f372b4a22.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/215-2330f9dd-0810-4f25-a227-05cb9a229aa8.txn b/.lancedb/nltk_chunking.lance/_transactions/215-2330f9dd-0810-4f25-a227-05cb9a229aa8.txn new file mode 100644 index 0000000000000000000000000000000000000000..50db5a1f45bd77e2a6e262500bcb98cc79734e92 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/215-2330f9dd-0810-4f25-a227-05cb9a229aa8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/216-0cc19c94-a120-475c-9b30-ffefb6de3b37.txn b/.lancedb/nltk_chunking.lance/_transactions/216-0cc19c94-a120-475c-9b30-ffefb6de3b37.txn new file mode 100644 index 0000000000000000000000000000000000000000..c082e5a0f6c9cf74a6086922b4be2000b86b7ba6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/216-0cc19c94-a120-475c-9b30-ffefb6de3b37.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/217-c16de549-8c52-4c03-807d-de2e100801d4.txn b/.lancedb/nltk_chunking.lance/_transactions/217-c16de549-8c52-4c03-807d-de2e100801d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..fa85b1a3e89e335d2bf4e1e9ae49d5dc15331919 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/217-c16de549-8c52-4c03-807d-de2e100801d4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/218-0883d710-370a-4859-96a8-d397256989d0.txn b/.lancedb/nltk_chunking.lance/_transactions/218-0883d710-370a-4859-96a8-d397256989d0.txn new file mode 100644 index 0000000000000000000000000000000000000000..dd703a75e2d8f06fd59ad93a05262cf8dbeeb4a3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/218-0883d710-370a-4859-96a8-d397256989d0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/219-251b0011-6c94-4d40-817a-dc530f929c22.txn b/.lancedb/nltk_chunking.lance/_transactions/219-251b0011-6c94-4d40-817a-dc530f929c22.txn new file mode 100644 index 0000000000000000000000000000000000000000..1a7bd3ab1ac8de5543b96ed14f16d565e9e511a2 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/219-251b0011-6c94-4d40-817a-dc530f929c22.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/22-99152cfe-7ce5-4030-9758-2ca169251eb7.txn b/.lancedb/nltk_chunking.lance/_transactions/22-99152cfe-7ce5-4030-9758-2ca169251eb7.txn new file mode 100644 index 0000000000000000000000000000000000000000..e82b4e0d9c45dfe395aa439d4a15b6bedb52064a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/22-99152cfe-7ce5-4030-9758-2ca169251eb7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/220-80fbfc82-c50e-496e-ba8e-d00aabbb8bf3.txn b/.lancedb/nltk_chunking.lance/_transactions/220-80fbfc82-c50e-496e-ba8e-d00aabbb8bf3.txn new file mode 100644 index 0000000000000000000000000000000000000000..5906f31b99bdd4ac13833cf1b307d7590195e851 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/220-80fbfc82-c50e-496e-ba8e-d00aabbb8bf3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/221-73a64a31-ef34-4686-8aee-18bd6c5ad101.txn b/.lancedb/nltk_chunking.lance/_transactions/221-73a64a31-ef34-4686-8aee-18bd6c5ad101.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd6823d2e48dc6702a068bc7fc01338d0c4919cd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/221-73a64a31-ef34-4686-8aee-18bd6c5ad101.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/222-54657934-d9d4-4694-b922-1b2d956e6652.txn b/.lancedb/nltk_chunking.lance/_transactions/222-54657934-d9d4-4694-b922-1b2d956e6652.txn new file mode 100644 index 0000000000000000000000000000000000000000..8efc5b5c60356e1e4e34ce78e7792ce6e1ccf811 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/222-54657934-d9d4-4694-b922-1b2d956e6652.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/223-f1ea771b-7f23-43b4-8abf-673d221d1b09.txn b/.lancedb/nltk_chunking.lance/_transactions/223-f1ea771b-7f23-43b4-8abf-673d221d1b09.txn new file mode 100644 index 0000000000000000000000000000000000000000..2af50b0d690d5583617eafc6b21eea6f5d91b995 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/223-f1ea771b-7f23-43b4-8abf-673d221d1b09.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/224-2de2f41b-8ff8-4676-8795-73d514a789e5.txn b/.lancedb/nltk_chunking.lance/_transactions/224-2de2f41b-8ff8-4676-8795-73d514a789e5.txn new file mode 100644 index 0000000000000000000000000000000000000000..01481d5f2d9e63272156ffdd8e86c3b46bcd367f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/224-2de2f41b-8ff8-4676-8795-73d514a789e5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/225-9c511b25-4c2c-4469-989f-c1ad7a70ce1d.txn b/.lancedb/nltk_chunking.lance/_transactions/225-9c511b25-4c2c-4469-989f-c1ad7a70ce1d.txn new file mode 100644 index 0000000000000000000000000000000000000000..a43110e54d2b184dbd99e3f0de08e95f67783386 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/225-9c511b25-4c2c-4469-989f-c1ad7a70ce1d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/226-f12b9b08-7b9f-4166-8c52-793b5d6b51ea.txn b/.lancedb/nltk_chunking.lance/_transactions/226-f12b9b08-7b9f-4166-8c52-793b5d6b51ea.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff1ded0130b966f6a048af639528f05f453cd46d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/226-f12b9b08-7b9f-4166-8c52-793b5d6b51ea.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/227-8d567431-9631-49a0-894a-1adf45b31e8a.txn b/.lancedb/nltk_chunking.lance/_transactions/227-8d567431-9631-49a0-894a-1adf45b31e8a.txn new file mode 100644 index 0000000000000000000000000000000000000000..d8e403a03f2cb62b4e0dc44adc1c26c26416bdc9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/227-8d567431-9631-49a0-894a-1adf45b31e8a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/228-d92b58b4-ae36-4b5d-93e4-2c757588fd05.txn b/.lancedb/nltk_chunking.lance/_transactions/228-d92b58b4-ae36-4b5d-93e4-2c757588fd05.txn new file mode 100644 index 0000000000000000000000000000000000000000..baa9077c53f70f139eeb0533a423c79f52440723 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/228-d92b58b4-ae36-4b5d-93e4-2c757588fd05.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/229-0c7a3968-ea21-492d-92d2-77cfb807b045.txn b/.lancedb/nltk_chunking.lance/_transactions/229-0c7a3968-ea21-492d-92d2-77cfb807b045.txn new file mode 100644 index 0000000000000000000000000000000000000000..453521d66d74d56739b1907b60bc410f78270ccb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/229-0c7a3968-ea21-492d-92d2-77cfb807b045.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/23-78061305-086b-4e03-a999-b1ba46b0d21c.txn b/.lancedb/nltk_chunking.lance/_transactions/23-78061305-086b-4e03-a999-b1ba46b0d21c.txn new file mode 100644 index 0000000000000000000000000000000000000000..e08c8aff762cae6b131c75f3c7d9c3b22fb150cc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/23-78061305-086b-4e03-a999-b1ba46b0d21c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/230-5be985af-0632-4204-9ffd-b2b7a49d20b2.txn b/.lancedb/nltk_chunking.lance/_transactions/230-5be985af-0632-4204-9ffd-b2b7a49d20b2.txn new file mode 100644 index 0000000000000000000000000000000000000000..15143c2c30d96be75b54d9f4a6c6be7891d99de9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/230-5be985af-0632-4204-9ffd-b2b7a49d20b2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/231-1979b65a-0252-4a7f-95fe-49fff4e52ce1.txn b/.lancedb/nltk_chunking.lance/_transactions/231-1979b65a-0252-4a7f-95fe-49fff4e52ce1.txn new file mode 100644 index 0000000000000000000000000000000000000000..faa3715aed5e86b04c33e0c5cc6a6a197f3cb303 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/231-1979b65a-0252-4a7f-95fe-49fff4e52ce1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/232-6ba1b149-4cf8-4d3f-9917-2071607c9dd6.txn b/.lancedb/nltk_chunking.lance/_transactions/232-6ba1b149-4cf8-4d3f-9917-2071607c9dd6.txn new file mode 100644 index 0000000000000000000000000000000000000000..532e2fa5f72b31d25956f62ffa9f957017cadbdf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/232-6ba1b149-4cf8-4d3f-9917-2071607c9dd6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/233-ef76d8ed-8070-4d07-976d-94354cb30c00.txn b/.lancedb/nltk_chunking.lance/_transactions/233-ef76d8ed-8070-4d07-976d-94354cb30c00.txn new file mode 100644 index 0000000000000000000000000000000000000000..c994e32fcb36fc50dae61f479def911acadcbce4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/233-ef76d8ed-8070-4d07-976d-94354cb30c00.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/234-9fdabe85-edaf-4377-a656-cf3a7d162a45.txn 
b/.lancedb/nltk_chunking.lance/_transactions/234-9fdabe85-edaf-4377-a656-cf3a7d162a45.txn new file mode 100644 index 0000000000000000000000000000000000000000..d12d8a5b9bbba4fa9334a098235a3ebd46f04112 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/234-9fdabe85-edaf-4377-a656-cf3a7d162a45.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/235-36d27c01-7611-4587-84db-bf0cca9af45c.txn b/.lancedb/nltk_chunking.lance/_transactions/235-36d27c01-7611-4587-84db-bf0cca9af45c.txn new file mode 100644 index 0000000000000000000000000000000000000000..44d6798a460c6d8702e69943c60414857b0525d1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/235-36d27c01-7611-4587-84db-bf0cca9af45c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/236-16552c79-5fe4-4dc2-9f1d-492dc3dac2e6.txn b/.lancedb/nltk_chunking.lance/_transactions/236-16552c79-5fe4-4dc2-9f1d-492dc3dac2e6.txn new file mode 100644 index 0000000000000000000000000000000000000000..9993faaf92ba66f68d8d37276e36ee059d6123a5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/236-16552c79-5fe4-4dc2-9f1d-492dc3dac2e6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/237-c81b2a5f-d264-4571-a75c-4c1e90692ccf.txn b/.lancedb/nltk_chunking.lance/_transactions/237-c81b2a5f-d264-4571-a75c-4c1e90692ccf.txn new file mode 100644 index 0000000000000000000000000000000000000000..fd2a6e604afac5de1d03fc0d5ef2914b76650fc4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/237-c81b2a5f-d264-4571-a75c-4c1e90692ccf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/238-6707ef9c-9034-4c61-8514-f83558b15ae6.txn b/.lancedb/nltk_chunking.lance/_transactions/238-6707ef9c-9034-4c61-8514-f83558b15ae6.txn new file mode 100644 index 0000000000000000000000000000000000000000..2d06c65e9f07cc041c9e36d96986945bbbff9881 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/238-6707ef9c-9034-4c61-8514-f83558b15ae6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/239-d9c5d57a-a927-4eff-9e3d-301c8b987f10.txn b/.lancedb/nltk_chunking.lance/_transactions/239-d9c5d57a-a927-4eff-9e3d-301c8b987f10.txn new file mode 100644 index 0000000000000000000000000000000000000000..4d71eac10388f4d026fa56c5a69e2c1cb09923f3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/239-d9c5d57a-a927-4eff-9e3d-301c8b987f10.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/24-0158837c-52a8-4ffa-a1ab-fa3dd3636606.txn b/.lancedb/nltk_chunking.lance/_transactions/24-0158837c-52a8-4ffa-a1ab-fa3dd3636606.txn new file mode 100644 index 0000000000000000000000000000000000000000..f29c47e03defb148f59b5b919f335fd1b8bdd0e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/24-0158837c-52a8-4ffa-a1ab-fa3dd3636606.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/240-03870638-0d1c-46ed-a33d-bc27f21ee740.txn b/.lancedb/nltk_chunking.lance/_transactions/240-03870638-0d1c-46ed-a33d-bc27f21ee740.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff2fe8bc712899a4b43704acbf51463f1fae7de5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/240-03870638-0d1c-46ed-a33d-bc27f21ee740.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/241-231980f5-167c-408d-8205-c76acd1cefc3.txn b/.lancedb/nltk_chunking.lance/_transactions/241-231980f5-167c-408d-8205-c76acd1cefc3.txn new file mode 100644 index 
0000000000000000000000000000000000000000..d0c4196fc2e458bcdadf31d2e5b5f2c5a2edfbef Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/241-231980f5-167c-408d-8205-c76acd1cefc3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/242-a3d93680-e5c9-458a-ac42-1e4bc872c838.txn b/.lancedb/nltk_chunking.lance/_transactions/242-a3d93680-e5c9-458a-ac42-1e4bc872c838.txn new file mode 100644 index 0000000000000000000000000000000000000000..68c6022383eed9f76cf7c4bc3743c808091adb10 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/242-a3d93680-e5c9-458a-ac42-1e4bc872c838.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/243-08d1f8fb-270b-4860-b7a6-6c253da0d2a1.txn b/.lancedb/nltk_chunking.lance/_transactions/243-08d1f8fb-270b-4860-b7a6-6c253da0d2a1.txn new file mode 100644 index 0000000000000000000000000000000000000000..485835f9073b180e624f29369017f161876af3bf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/243-08d1f8fb-270b-4860-b7a6-6c253da0d2a1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/244-95d67d3e-b11b-450a-9169-b6648634201f.txn b/.lancedb/nltk_chunking.lance/_transactions/244-95d67d3e-b11b-450a-9169-b6648634201f.txn new file mode 100644 index 0000000000000000000000000000000000000000..5d507ed8336e567c6cc952a43371b22f5126fa91 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/244-95d67d3e-b11b-450a-9169-b6648634201f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/245-0471b369-9151-443c-b7c3-46fdc85f8b5a.txn b/.lancedb/nltk_chunking.lance/_transactions/245-0471b369-9151-443c-b7c3-46fdc85f8b5a.txn new file mode 100644 index 0000000000000000000000000000000000000000..9c28700c9a245058097e807da5e8daae56fc88c6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/245-0471b369-9151-443c-b7c3-46fdc85f8b5a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/246-296b36b4-69a1-4c5e-8a97-880f5ba504a7.txn b/.lancedb/nltk_chunking.lance/_transactions/246-296b36b4-69a1-4c5e-8a97-880f5ba504a7.txn new file mode 100644 index 0000000000000000000000000000000000000000..6400d69c742beb89034256feb66b2573f31a479d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/246-296b36b4-69a1-4c5e-8a97-880f5ba504a7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/247-d1d4aab9-7117-4f5e-ac5e-f0b0b1ad67bc.txn b/.lancedb/nltk_chunking.lance/_transactions/247-d1d4aab9-7117-4f5e-ac5e-f0b0b1ad67bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..5b66f0051561d2047f27844aaf38c28d08c5ff63 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/247-d1d4aab9-7117-4f5e-ac5e-f0b0b1ad67bc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/248-a594d041-5d0d-44d8-8b05-e8013da73864.txn b/.lancedb/nltk_chunking.lance/_transactions/248-a594d041-5d0d-44d8-8b05-e8013da73864.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb2d483bdc2e326a1d5c3cc8b42891495c0eab27 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/248-a594d041-5d0d-44d8-8b05-e8013da73864.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/249-22da11fb-1e7f-4273-87b6-9cfd9e254ca4.txn b/.lancedb/nltk_chunking.lance/_transactions/249-22da11fb-1e7f-4273-87b6-9cfd9e254ca4.txn new file mode 100644 index 0000000000000000000000000000000000000000..ed1767f60142fad69cc272a481c21decce3e574a Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/249-22da11fb-1e7f-4273-87b6-9cfd9e254ca4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/25-7b71a77d-61fd-4b70-b57f-57fc973f62e4.txn b/.lancedb/nltk_chunking.lance/_transactions/25-7b71a77d-61fd-4b70-b57f-57fc973f62e4.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff436b6289d0e6c2882920dd7751ee77ca7f1f4c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/25-7b71a77d-61fd-4b70-b57f-57fc973f62e4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/250-70a8dff4-d8df-485f-99de-113571ddecf3.txn b/.lancedb/nltk_chunking.lance/_transactions/250-70a8dff4-d8df-485f-99de-113571ddecf3.txn new file mode 100644 index 0000000000000000000000000000000000000000..64b02c0096828b8114f93c91155371b02b2db729 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/250-70a8dff4-d8df-485f-99de-113571ddecf3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/251-d1984aa8-99cb-4f49-8819-d7d2e6753a7b.txn b/.lancedb/nltk_chunking.lance/_transactions/251-d1984aa8-99cb-4f49-8819-d7d2e6753a7b.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7af41145ff6e0be213aa06ff4fdf5a0ddcf2263 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/251-d1984aa8-99cb-4f49-8819-d7d2e6753a7b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/252-9ed13e8c-9953-4e40-b434-55467fd2dada.txn b/.lancedb/nltk_chunking.lance/_transactions/252-9ed13e8c-9953-4e40-b434-55467fd2dada.txn new file mode 100644 index 0000000000000000000000000000000000000000..53471c8205a96f02ab4c6f97bc2dd785367c35e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/252-9ed13e8c-9953-4e40-b434-55467fd2dada.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/253-a5b1e03c-4c60-433c-b69c-815d54e8dacc.txn b/.lancedb/nltk_chunking.lance/_transactions/253-a5b1e03c-4c60-433c-b69c-815d54e8dacc.txn new file mode 100644 index 0000000000000000000000000000000000000000..03ed90ee614f84069ecdeef1a1294fba748272ea Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/253-a5b1e03c-4c60-433c-b69c-815d54e8dacc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/254-0e5ad2ca-7627-4f49-9967-cc8bfe3d3db9.txn b/.lancedb/nltk_chunking.lance/_transactions/254-0e5ad2ca-7627-4f49-9967-cc8bfe3d3db9.txn new file mode 100644 index 0000000000000000000000000000000000000000..3a1de7447b20a7f836e7b8e264a5201d72c9c120 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/254-0e5ad2ca-7627-4f49-9967-cc8bfe3d3db9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/255-d8076372-99db-463d-b87b-b4230016bca2.txn b/.lancedb/nltk_chunking.lance/_transactions/255-d8076372-99db-463d-b87b-b4230016bca2.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0fb4ebbe2c75d2b4d202193209d63ca8cafa722 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/255-d8076372-99db-463d-b87b-b4230016bca2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/256-8034ee8c-7e15-4075-8421-75e45a2cc162.txn b/.lancedb/nltk_chunking.lance/_transactions/256-8034ee8c-7e15-4075-8421-75e45a2cc162.txn new file mode 100644 index 0000000000000000000000000000000000000000..c438de2c9d4cedc5d0dbe914e4fddb4baaa871b9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/256-8034ee8c-7e15-4075-8421-75e45a2cc162.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/257-2c3d6963-6575-419d-9f54-a7941707c971.txn b/.lancedb/nltk_chunking.lance/_transactions/257-2c3d6963-6575-419d-9f54-a7941707c971.txn new file mode 100644 index 0000000000000000000000000000000000000000..d3e1cdfc2dd7d226f02ce81b40fbcc5879f90cbb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/257-2c3d6963-6575-419d-9f54-a7941707c971.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/258-98c7f791-7499-4721-9854-bbb6fa2ce4c4.txn b/.lancedb/nltk_chunking.lance/_transactions/258-98c7f791-7499-4721-9854-bbb6fa2ce4c4.txn new file mode 100644 index 0000000000000000000000000000000000000000..14a040d580798da3491fb3fa3ffacf42ee01e915 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/258-98c7f791-7499-4721-9854-bbb6fa2ce4c4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/259-a88b2976-f1cb-45ff-b22a-c61483baddd5.txn b/.lancedb/nltk_chunking.lance/_transactions/259-a88b2976-f1cb-45ff-b22a-c61483baddd5.txn new file mode 100644 index 0000000000000000000000000000000000000000..6db7e1f06341ae0c9b316d052fd1bf98789b559f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/259-a88b2976-f1cb-45ff-b22a-c61483baddd5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/26-719d1911-b21e-4952-a7fa-0732a38136c1.txn b/.lancedb/nltk_chunking.lance/_transactions/26-719d1911-b21e-4952-a7fa-0732a38136c1.txn new file mode 100644 index 0000000000000000000000000000000000000000..31ca03ec977fdde1f1256b2aa1b82dcf6b08139a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/26-719d1911-b21e-4952-a7fa-0732a38136c1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/260-79bbf052-6b22-4da7-aad1-2cc13a74e4da.txn b/.lancedb/nltk_chunking.lance/_transactions/260-79bbf052-6b22-4da7-aad1-2cc13a74e4da.txn new file mode 100644 index 0000000000000000000000000000000000000000..60aed5b06484fe04660a93afb13b9db7b08ac538 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/260-79bbf052-6b22-4da7-aad1-2cc13a74e4da.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/261-5691962b-f61f-4ec7-9104-201c736faad7.txn b/.lancedb/nltk_chunking.lance/_transactions/261-5691962b-f61f-4ec7-9104-201c736faad7.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7a484bf60dc2b95f33dd68356156b5ec066fe92 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/261-5691962b-f61f-4ec7-9104-201c736faad7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/262-447bc9ab-a2fd-4cca-af9d-8f8c2c5a2766.txn b/.lancedb/nltk_chunking.lance/_transactions/262-447bc9ab-a2fd-4cca-af9d-8f8c2c5a2766.txn new file mode 100644 index 0000000000000000000000000000000000000000..d9ade120934a98e1c4a9a7fb525199e1ae4db4ea Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/262-447bc9ab-a2fd-4cca-af9d-8f8c2c5a2766.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/263-dacec87c-ffc9-4a0c-bc00-3d2f4a5a1e07.txn b/.lancedb/nltk_chunking.lance/_transactions/263-dacec87c-ffc9-4a0c-bc00-3d2f4a5a1e07.txn new file mode 100644 index 0000000000000000000000000000000000000000..ccc9d58ea9ec9489a12e90b0333644fd5215ab33 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/263-dacec87c-ffc9-4a0c-bc00-3d2f4a5a1e07.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/264-1cffa10c-704c-48ac-8435-0ff0707a15f0.txn 
b/.lancedb/nltk_chunking.lance/_transactions/264-1cffa10c-704c-48ac-8435-0ff0707a15f0.txn new file mode 100644 index 0000000000000000000000000000000000000000..b7e2c7ee61e0eb24457fde48d3dfbd431281b3f3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/264-1cffa10c-704c-48ac-8435-0ff0707a15f0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/265-b562c35e-417c-417c-9e12-c49c6fdfbf77.txn b/.lancedb/nltk_chunking.lance/_transactions/265-b562c35e-417c-417c-9e12-c49c6fdfbf77.txn new file mode 100644 index 0000000000000000000000000000000000000000..d9eb9db976f83b4170d0afa28da115491a80a4cd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/265-b562c35e-417c-417c-9e12-c49c6fdfbf77.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/266-8058148d-9a3e-4a9b-ac97-88385885feb6.txn b/.lancedb/nltk_chunking.lance/_transactions/266-8058148d-9a3e-4a9b-ac97-88385885feb6.txn new file mode 100644 index 0000000000000000000000000000000000000000..db39393f3d680971d1172f694e87fff7f11d857e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/266-8058148d-9a3e-4a9b-ac97-88385885feb6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/267-734c38fe-85a0-4613-879f-c57acdd2df19.txn b/.lancedb/nltk_chunking.lance/_transactions/267-734c38fe-85a0-4613-879f-c57acdd2df19.txn new file mode 100644 index 0000000000000000000000000000000000000000..26267bf9af638ce0cccdacac57bb6b6ace6799fc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/267-734c38fe-85a0-4613-879f-c57acdd2df19.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/268-c487edb0-88dc-4590-88cf-0d64a8dffa06.txn b/.lancedb/nltk_chunking.lance/_transactions/268-c487edb0-88dc-4590-88cf-0d64a8dffa06.txn new file mode 100644 index 0000000000000000000000000000000000000000..369adf15e170218a8c937c6b7f3dbbe37cf28a24 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/268-c487edb0-88dc-4590-88cf-0d64a8dffa06.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/269-ee4eed04-8583-462b-9fdb-f78130ecc190.txn b/.lancedb/nltk_chunking.lance/_transactions/269-ee4eed04-8583-462b-9fdb-f78130ecc190.txn new file mode 100644 index 0000000000000000000000000000000000000000..62d00e057fea94a60708c0cee796bd69d3379c8c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/269-ee4eed04-8583-462b-9fdb-f78130ecc190.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/27-4f750cc3-6234-4616-be82-b16d0d7d2096.txn b/.lancedb/nltk_chunking.lance/_transactions/27-4f750cc3-6234-4616-be82-b16d0d7d2096.txn new file mode 100644 index 0000000000000000000000000000000000000000..0df373a55f9f520855bfe4181295e0e88bb74660 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/27-4f750cc3-6234-4616-be82-b16d0d7d2096.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/270-aca9b02c-788f-4994-8159-88c9ba4dfcd8.txn b/.lancedb/nltk_chunking.lance/_transactions/270-aca9b02c-788f-4994-8159-88c9ba4dfcd8.txn new file mode 100644 index 0000000000000000000000000000000000000000..d4185b9c00b58e9b710a219300fb004b4c0c5a2d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/270-aca9b02c-788f-4994-8159-88c9ba4dfcd8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/271-1cb58abe-9148-4231-86ff-9f0704f7ac33.txn b/.lancedb/nltk_chunking.lance/_transactions/271-1cb58abe-9148-4231-86ff-9f0704f7ac33.txn new file mode 100644 index 
0000000000000000000000000000000000000000..86cba91a0a38603a8927ecea406259caf8496f89 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/271-1cb58abe-9148-4231-86ff-9f0704f7ac33.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/272-6b78b3a1-149e-4a4d-a30b-6c610ca9ae13.txn b/.lancedb/nltk_chunking.lance/_transactions/272-6b78b3a1-149e-4a4d-a30b-6c610ca9ae13.txn new file mode 100644 index 0000000000000000000000000000000000000000..2109b67d0258c67dfaed41000c31f99b3a65136e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/272-6b78b3a1-149e-4a4d-a30b-6c610ca9ae13.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/273-8171adff-efca-4681-b4d1-9f2c61a5e3a7.txn b/.lancedb/nltk_chunking.lance/_transactions/273-8171adff-efca-4681-b4d1-9f2c61a5e3a7.txn new file mode 100644 index 0000000000000000000000000000000000000000..5a266648b6e86ee52c33066fd5d194789cd04c93 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/273-8171adff-efca-4681-b4d1-9f2c61a5e3a7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/274-41ca7379-b189-4447-b7c6-ab580ee3909f.txn b/.lancedb/nltk_chunking.lance/_transactions/274-41ca7379-b189-4447-b7c6-ab580ee3909f.txn new file mode 100644 index 0000000000000000000000000000000000000000..dfb2c0de91915cb9aa620e0244168447c8853a3d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/274-41ca7379-b189-4447-b7c6-ab580ee3909f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/275-ad303f59-0551-4073-9c8a-755c89497618.txn b/.lancedb/nltk_chunking.lance/_transactions/275-ad303f59-0551-4073-9c8a-755c89497618.txn new file mode 100644 index 0000000000000000000000000000000000000000..de7ef505837d2b3c06db08a154f1fb2dd8b375bc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/275-ad303f59-0551-4073-9c8a-755c89497618.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/276-2df7f307-eec5-417f-b74f-49396803b7b7.txn b/.lancedb/nltk_chunking.lance/_transactions/276-2df7f307-eec5-417f-b74f-49396803b7b7.txn new file mode 100644 index 0000000000000000000000000000000000000000..45e77083faeb3a57c493591ab33a694f306d6f66 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/276-2df7f307-eec5-417f-b74f-49396803b7b7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/277-0b5af28b-6e10-4149-80d0-27b49fbe9076.txn b/.lancedb/nltk_chunking.lance/_transactions/277-0b5af28b-6e10-4149-80d0-27b49fbe9076.txn new file mode 100644 index 0000000000000000000000000000000000000000..7756168306afe58235a9309adb77c17696724201 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/277-0b5af28b-6e10-4149-80d0-27b49fbe9076.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/278-77f424a0-12da-4bfd-b4cc-cef63760b100.txn b/.lancedb/nltk_chunking.lance/_transactions/278-77f424a0-12da-4bfd-b4cc-cef63760b100.txn new file mode 100644 index 0000000000000000000000000000000000000000..9d9faf079633bfaa79bb28ce9ca4e3191b917054 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/278-77f424a0-12da-4bfd-b4cc-cef63760b100.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/279-49b2413a-816b-410a-a911-6651a4e3dea0.txn b/.lancedb/nltk_chunking.lance/_transactions/279-49b2413a-816b-410a-a911-6651a4e3dea0.txn new file mode 100644 index 0000000000000000000000000000000000000000..8e21c6191d604686cbd4dff09567146d9a8af1b3 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/279-49b2413a-816b-410a-a911-6651a4e3dea0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/28-ae74fe84-1b52-4c4e-a46f-a8ca675bcfd4.txn b/.lancedb/nltk_chunking.lance/_transactions/28-ae74fe84-1b52-4c4e-a46f-a8ca675bcfd4.txn new file mode 100644 index 0000000000000000000000000000000000000000..d12a0bcb5c5c1fb3e3ea1496bc5dea012f3fc30e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/28-ae74fe84-1b52-4c4e-a46f-a8ca675bcfd4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/280-9e4ce192-e502-4b45-9b26-5ac9643ac94e.txn b/.lancedb/nltk_chunking.lance/_transactions/280-9e4ce192-e502-4b45-9b26-5ac9643ac94e.txn new file mode 100644 index 0000000000000000000000000000000000000000..f98a4a6f5578803c43c29a02f8101238de9c4a8f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/280-9e4ce192-e502-4b45-9b26-5ac9643ac94e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/281-6d059f1c-aba6-46e4-8097-d5df77bd94bc.txn b/.lancedb/nltk_chunking.lance/_transactions/281-6d059f1c-aba6-46e4-8097-d5df77bd94bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..78459952c52769d7a7fd8773698639822ae81d3c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/281-6d059f1c-aba6-46e4-8097-d5df77bd94bc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/282-25ff28a4-7f92-487a-be4b-690f862fe830.txn b/.lancedb/nltk_chunking.lance/_transactions/282-25ff28a4-7f92-487a-be4b-690f862fe830.txn new file mode 100644 index 0000000000000000000000000000000000000000..26d32b2622ccb137fe50e29d99142718a651cedb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/282-25ff28a4-7f92-487a-be4b-690f862fe830.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/283-12a01624-7a8b-4100-bef8-975f2ebc6729.txn b/.lancedb/nltk_chunking.lance/_transactions/283-12a01624-7a8b-4100-bef8-975f2ebc6729.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b6f3c60eca85065f29f91a1f0decea4180d108c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/283-12a01624-7a8b-4100-bef8-975f2ebc6729.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/284-aa2b0e5e-7fdc-41bc-8444-b98ee82c5315.txn b/.lancedb/nltk_chunking.lance/_transactions/284-aa2b0e5e-7fdc-41bc-8444-b98ee82c5315.txn new file mode 100644 index 0000000000000000000000000000000000000000..12a40f61c4d1e26f4587a97e99add69ed27c1e12 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/284-aa2b0e5e-7fdc-41bc-8444-b98ee82c5315.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/285-a44d73f7-e830-4522-8b53-dc14efedb6fb.txn b/.lancedb/nltk_chunking.lance/_transactions/285-a44d73f7-e830-4522-8b53-dc14efedb6fb.txn new file mode 100644 index 0000000000000000000000000000000000000000..7e75faa1c9cf3e378b5185d7be0e626ecc544be6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/285-a44d73f7-e830-4522-8b53-dc14efedb6fb.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/286-70f940a1-2f87-46e4-9781-e1831a704c59.txn b/.lancedb/nltk_chunking.lance/_transactions/286-70f940a1-2f87-46e4-9781-e1831a704c59.txn new file mode 100644 index 0000000000000000000000000000000000000000..b10027380724675ed2210999c2cd31186e3142d4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/286-70f940a1-2f87-46e4-9781-e1831a704c59.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/287-a38c37b0-7459-4da3-b459-e9238a414945.txn b/.lancedb/nltk_chunking.lance/_transactions/287-a38c37b0-7459-4da3-b459-e9238a414945.txn new file mode 100644 index 0000000000000000000000000000000000000000..ca40f4351618ba6d0ed64c71c9b4359660e0351a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/287-a38c37b0-7459-4da3-b459-e9238a414945.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/288-14688ce0-0776-49b6-8c12-d1d371b11cfd.txn b/.lancedb/nltk_chunking.lance/_transactions/288-14688ce0-0776-49b6-8c12-d1d371b11cfd.txn new file mode 100644 index 0000000000000000000000000000000000000000..bfb193bfcab12c1d58eafb201dcaf2cf4ae7543a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/288-14688ce0-0776-49b6-8c12-d1d371b11cfd.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/289-f2ec187d-b584-4685-b55c-d564ad454437.txn b/.lancedb/nltk_chunking.lance/_transactions/289-f2ec187d-b584-4685-b55c-d564ad454437.txn new file mode 100644 index 0000000000000000000000000000000000000000..80840fee3accb837ea7fa658f1840fc6e16553be Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/289-f2ec187d-b584-4685-b55c-d564ad454437.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/29-65458ade-e89c-42b7-bbc6-3a734b4cb716.txn b/.lancedb/nltk_chunking.lance/_transactions/29-65458ade-e89c-42b7-bbc6-3a734b4cb716.txn new file mode 100644 index 0000000000000000000000000000000000000000..7006ca0b74b6c8c5bb92cfa701d8a9ba64212882 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/29-65458ade-e89c-42b7-bbc6-3a734b4cb716.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/290-7d651ac6-4027-4450-827a-740206468582.txn b/.lancedb/nltk_chunking.lance/_transactions/290-7d651ac6-4027-4450-827a-740206468582.txn new file mode 100644 index 0000000000000000000000000000000000000000..a037117a131a3f77529e331ecc42730f68620be9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/290-7d651ac6-4027-4450-827a-740206468582.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/291-6863daa4-4203-428c-9375-c7bd6dd20724.txn b/.lancedb/nltk_chunking.lance/_transactions/291-6863daa4-4203-428c-9375-c7bd6dd20724.txn new file mode 100644 index 0000000000000000000000000000000000000000..be1414ce177a47f86cafd30f055a05dcd32b19b1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/291-6863daa4-4203-428c-9375-c7bd6dd20724.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/292-20135ae9-f081-422f-88b9-dbe668977f07.txn b/.lancedb/nltk_chunking.lance/_transactions/292-20135ae9-f081-422f-88b9-dbe668977f07.txn new file mode 100644 index 0000000000000000000000000000000000000000..9defc6840a569fc4cdda52c8de251ff35b551159 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/292-20135ae9-f081-422f-88b9-dbe668977f07.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/293-a6b94374-0960-4e3c-9677-4fdfa9e46159.txn b/.lancedb/nltk_chunking.lance/_transactions/293-a6b94374-0960-4e3c-9677-4fdfa9e46159.txn new file mode 100644 index 0000000000000000000000000000000000000000..32809049f81242d557e5be9e749eb3964cb545e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/293-a6b94374-0960-4e3c-9677-4fdfa9e46159.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/294-99451ee9-7cb6-4b29-a217-c48a880e6cda.txn 
b/.lancedb/nltk_chunking.lance/_transactions/294-99451ee9-7cb6-4b29-a217-c48a880e6cda.txn new file mode 100644 index 0000000000000000000000000000000000000000..c07e588423aa696083a20afe2a015093d23b3bdb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/294-99451ee9-7cb6-4b29-a217-c48a880e6cda.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/295-e870d74f-7fc5-4ab6-a394-80981cacb6e8.txn b/.lancedb/nltk_chunking.lance/_transactions/295-e870d74f-7fc5-4ab6-a394-80981cacb6e8.txn new file mode 100644 index 0000000000000000000000000000000000000000..ab052865c1879354a78d66845ae5aad815a23813 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/295-e870d74f-7fc5-4ab6-a394-80981cacb6e8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/296-e7e68e1d-f29c-4edd-9b8d-85108f2a2fc5.txn b/.lancedb/nltk_chunking.lance/_transactions/296-e7e68e1d-f29c-4edd-9b8d-85108f2a2fc5.txn new file mode 100644 index 0000000000000000000000000000000000000000..5a51460dad99bdadc8a3495fccf2437693e91e67 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/296-e7e68e1d-f29c-4edd-9b8d-85108f2a2fc5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/297-52a75cd8-6448-4481-bded-4deefb1db680.txn b/.lancedb/nltk_chunking.lance/_transactions/297-52a75cd8-6448-4481-bded-4deefb1db680.txn new file mode 100644 index 0000000000000000000000000000000000000000..c90c1427aca0416f139f33a815d81092e4da3216 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/297-52a75cd8-6448-4481-bded-4deefb1db680.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/298-e1dbfcbe-902d-4b26-b977-4956e233bf59.txn b/.lancedb/nltk_chunking.lance/_transactions/298-e1dbfcbe-902d-4b26-b977-4956e233bf59.txn new file mode 100644 index 0000000000000000000000000000000000000000..eec59f279e5a2b7c377b27377fb743b513ac0ec8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/298-e1dbfcbe-902d-4b26-b977-4956e233bf59.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/299-5f1f34a4-a3f1-4de7-a5f9-6270fb9fdf1e.txn b/.lancedb/nltk_chunking.lance/_transactions/299-5f1f34a4-a3f1-4de7-a5f9-6270fb9fdf1e.txn new file mode 100644 index 0000000000000000000000000000000000000000..d93a79139af2e80fe83d2a47735ba929a0de2ae0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/299-5f1f34a4-a3f1-4de7-a5f9-6270fb9fdf1e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/3-e87cf862-f78e-41e2-acb3-9bd4e8ab83c6.txn b/.lancedb/nltk_chunking.lance/_transactions/3-e87cf862-f78e-41e2-acb3-9bd4e8ab83c6.txn new file mode 100644 index 0000000000000000000000000000000000000000..1ec383a3d3e3f2d28289d91d72cc3edc9fff329f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/_transactions/3-e87cf862-f78e-41e2-acb3-9bd4e8ab83c6.txn @@ -0,0 +1 @@ +$e87cf862-f78e-41e2-acb3-9bd4e8ab83c6²U2vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:38408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/nltk_chunking.lance/_transactions/30-f0913db4-74d1-49f2-9ebb-40ed25eb0f69.txn b/.lancedb/nltk_chunking.lance/_transactions/30-f0913db4-74d1-49f2-9ebb-40ed25eb0f69.txn new file mode 100644 index 0000000000000000000000000000000000000000..58716912e3593f306ddb7c2c58ae678bf0129612 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/30-f0913db4-74d1-49f2-9ebb-40ed25eb0f69.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/300-b8e51ad2-fd42-4336-925b-a079fa850d2d.txn 
b/.lancedb/nltk_chunking.lance/_transactions/300-b8e51ad2-fd42-4336-925b-a079fa850d2d.txn new file mode 100644 index 0000000000000000000000000000000000000000..b2c35fd735c7c8074945373150f44c40fc158f8a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/300-b8e51ad2-fd42-4336-925b-a079fa850d2d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/301-41d8efba-8739-4e32-b487-96b9f6bda16f.txn b/.lancedb/nltk_chunking.lance/_transactions/301-41d8efba-8739-4e32-b487-96b9f6bda16f.txn new file mode 100644 index 0000000000000000000000000000000000000000..37f8a26a74f6c1aa35794bf082b91b33c79e6f2f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/301-41d8efba-8739-4e32-b487-96b9f6bda16f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/302-8d57a8eb-72d6-4e3c-965e-0f5cd24a4081.txn b/.lancedb/nltk_chunking.lance/_transactions/302-8d57a8eb-72d6-4e3c-965e-0f5cd24a4081.txn new file mode 100644 index 0000000000000000000000000000000000000000..c204ff718963bb225d71f3798e87640ae439cfe0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/302-8d57a8eb-72d6-4e3c-965e-0f5cd24a4081.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/303-6c09d5d5-bde2-46b7-b439-b2d332438ac5.txn b/.lancedb/nltk_chunking.lance/_transactions/303-6c09d5d5-bde2-46b7-b439-b2d332438ac5.txn new file mode 100644 index 0000000000000000000000000000000000000000..b0ff593587551b9d924bfa05fdfb401d5f264886 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/303-6c09d5d5-bde2-46b7-b439-b2d332438ac5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/304-3bbad497-033d-4440-ad79-53834857d9a1.txn b/.lancedb/nltk_chunking.lance/_transactions/304-3bbad497-033d-4440-ad79-53834857d9a1.txn new file mode 100644 index 0000000000000000000000000000000000000000..0975ae543cacb25e73bd264b7b1772c946f63c8d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/304-3bbad497-033d-4440-ad79-53834857d9a1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/305-ab94e4d9-3861-436b-8377-c727936e56ee.txn b/.lancedb/nltk_chunking.lance/_transactions/305-ab94e4d9-3861-436b-8377-c727936e56ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..6a0491619e3c3123a48e8d6d95a06a83ddda1794 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/305-ab94e4d9-3861-436b-8377-c727936e56ee.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/306-dda1d385-bff1-4a8e-b618-13a4db5213eb.txn b/.lancedb/nltk_chunking.lance/_transactions/306-dda1d385-bff1-4a8e-b618-13a4db5213eb.txn new file mode 100644 index 0000000000000000000000000000000000000000..fc22b7f1089ff1a20187adedcd21c8c77c030f5d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/306-dda1d385-bff1-4a8e-b618-13a4db5213eb.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/307-c8a6770f-0e48-4b3b-9f15-2cbf13a8fd6f.txn b/.lancedb/nltk_chunking.lance/_transactions/307-c8a6770f-0e48-4b3b-9f15-2cbf13a8fd6f.txn new file mode 100644 index 0000000000000000000000000000000000000000..eae22455b21cdf1d2f5d1253fe9e98a10fa4d93d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/307-c8a6770f-0e48-4b3b-9f15-2cbf13a8fd6f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/308-9afcf5ad-4576-4548-aaed-300d35938f28.txn b/.lancedb/nltk_chunking.lance/_transactions/308-9afcf5ad-4576-4548-aaed-300d35938f28.txn new file mode 100644 index 
0000000000000000000000000000000000000000..61a123c2e5f6d22efcebfd9cd64fb9a11a55a3bd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/308-9afcf5ad-4576-4548-aaed-300d35938f28.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/309-ef799d68-207f-40a1-badd-7ce7ed9d60ea.txn b/.lancedb/nltk_chunking.lance/_transactions/309-ef799d68-207f-40a1-badd-7ce7ed9d60ea.txn new file mode 100644 index 0000000000000000000000000000000000000000..5e50640d65585cd2996b681aa959de370b3e5bda Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/309-ef799d68-207f-40a1-badd-7ce7ed9d60ea.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/31-beeffbbd-4a7f-4b86-8703-2e50d6ff4812.txn b/.lancedb/nltk_chunking.lance/_transactions/31-beeffbbd-4a7f-4b86-8703-2e50d6ff4812.txn new file mode 100644 index 0000000000000000000000000000000000000000..087093172b2aced69867ac3ba46b8dd5f3e6e9af Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/31-beeffbbd-4a7f-4b86-8703-2e50d6ff4812.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/310-7e4ca39e-ccff-46bb-8245-e0e8b251e312.txn b/.lancedb/nltk_chunking.lance/_transactions/310-7e4ca39e-ccff-46bb-8245-e0e8b251e312.txn new file mode 100644 index 0000000000000000000000000000000000000000..88e83402c74e8cc213174c34beef9c418c166355 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/310-7e4ca39e-ccff-46bb-8245-e0e8b251e312.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/311-3bd16cd6-80fb-4afa-ba4f-4c37d9cb4798.txn b/.lancedb/nltk_chunking.lance/_transactions/311-3bd16cd6-80fb-4afa-ba4f-4c37d9cb4798.txn new file mode 100644 index 0000000000000000000000000000000000000000..827fdb1a9ce4fec9cf9a420c3548620fbb0da6f9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/311-3bd16cd6-80fb-4afa-ba4f-4c37d9cb4798.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/312-fb9c6469-7ede-47d8-8f96-56b8cbfad4f8.txn b/.lancedb/nltk_chunking.lance/_transactions/312-fb9c6469-7ede-47d8-8f96-56b8cbfad4f8.txn new file mode 100644 index 0000000000000000000000000000000000000000..b0624e799db73a5d2c152c113cbd9dcd506d5180 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/312-fb9c6469-7ede-47d8-8f96-56b8cbfad4f8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/313-8555acc0-d7d9-4b0b-94a5-a777f06129a4.txn b/.lancedb/nltk_chunking.lance/_transactions/313-8555acc0-d7d9-4b0b-94a5-a777f06129a4.txn new file mode 100644 index 0000000000000000000000000000000000000000..9a98378e2b085036497545e547e370e4e506ea77 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/313-8555acc0-d7d9-4b0b-94a5-a777f06129a4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/314-0efee8cf-7d5a-42d7-9da6-38032d4e1a86.txn b/.lancedb/nltk_chunking.lance/_transactions/314-0efee8cf-7d5a-42d7-9da6-38032d4e1a86.txn new file mode 100644 index 0000000000000000000000000000000000000000..0bdd688d93a01cd5531cf6f5589a659dc8b1d08b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/314-0efee8cf-7d5a-42d7-9da6-38032d4e1a86.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/315-a5bce214-5401-4a03-9a9f-1a58d4d316cf.txn b/.lancedb/nltk_chunking.lance/_transactions/315-a5bce214-5401-4a03-9a9f-1a58d4d316cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..c0bcffb14e30e86ac4636cc17428295a656e9bed Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/315-a5bce214-5401-4a03-9a9f-1a58d4d316cf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/316-2b75cbe2-9dba-4f59-872f-d4cd59fe8742.txn b/.lancedb/nltk_chunking.lance/_transactions/316-2b75cbe2-9dba-4f59-872f-d4cd59fe8742.txn new file mode 100644 index 0000000000000000000000000000000000000000..65e9abb5419259d276ad014f7146cad9cbef699e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/316-2b75cbe2-9dba-4f59-872f-d4cd59fe8742.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/317-44f46e77-8c2e-469d-9333-68aa244fddf8.txn b/.lancedb/nltk_chunking.lance/_transactions/317-44f46e77-8c2e-469d-9333-68aa244fddf8.txn new file mode 100644 index 0000000000000000000000000000000000000000..10fa54e7d4d08e7bc093c8053179e004dbde7c48 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/317-44f46e77-8c2e-469d-9333-68aa244fddf8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/318-b4b54d4f-67c9-4259-81a7-6f6d9fdffbbd.txn b/.lancedb/nltk_chunking.lance/_transactions/318-b4b54d4f-67c9-4259-81a7-6f6d9fdffbbd.txn new file mode 100644 index 0000000000000000000000000000000000000000..8091841556831ce5e8f2e729867d7ae19525bf22 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/318-b4b54d4f-67c9-4259-81a7-6f6d9fdffbbd.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/319-0407dd76-4be4-4e7d-a484-8bd8ef16ba12.txn b/.lancedb/nltk_chunking.lance/_transactions/319-0407dd76-4be4-4e7d-a484-8bd8ef16ba12.txn new file mode 100644 index 0000000000000000000000000000000000000000..921179994150c62f2a187f64a7a04f13d888eadf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/319-0407dd76-4be4-4e7d-a484-8bd8ef16ba12.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/32-a6cb2994-19e3-4abe-ab96-883c48cfdc80.txn b/.lancedb/nltk_chunking.lance/_transactions/32-a6cb2994-19e3-4abe-ab96-883c48cfdc80.txn new file mode 100644 index 0000000000000000000000000000000000000000..81bc5f4b5f61cc0d1a5f4a43979585320855a677 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/32-a6cb2994-19e3-4abe-ab96-883c48cfdc80.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/320-b7400f10-a40f-47e0-a279-39aeb973f3bb.txn b/.lancedb/nltk_chunking.lance/_transactions/320-b7400f10-a40f-47e0-a279-39aeb973f3bb.txn new file mode 100644 index 0000000000000000000000000000000000000000..98f7dfe20a54f95529e61f600ea1f9984c033847 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/320-b7400f10-a40f-47e0-a279-39aeb973f3bb.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/321-9fc3e17e-c22d-4cc3-a5d0-6d4ce5ab98f3.txn b/.lancedb/nltk_chunking.lance/_transactions/321-9fc3e17e-c22d-4cc3-a5d0-6d4ce5ab98f3.txn new file mode 100644 index 0000000000000000000000000000000000000000..a0bcea39eeeb65356ed6e2b04c6f2a562b6e27ab Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/321-9fc3e17e-c22d-4cc3-a5d0-6d4ce5ab98f3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/322-716d7836-ea75-46f3-9b42-d8a1d27d528a.txn b/.lancedb/nltk_chunking.lance/_transactions/322-716d7836-ea75-46f3-9b42-d8a1d27d528a.txn new file mode 100644 index 0000000000000000000000000000000000000000..75729cd9e143f0fecca48ad0666de4e2c9e81690 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/322-716d7836-ea75-46f3-9b42-d8a1d27d528a.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/323-ad95dceb-1995-4d10-b9f2-adacb1ced9bc.txn b/.lancedb/nltk_chunking.lance/_transactions/323-ad95dceb-1995-4d10-b9f2-adacb1ced9bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..495454b238e6c6ef41d06ee78e724600a064f85c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/323-ad95dceb-1995-4d10-b9f2-adacb1ced9bc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/324-1aca772c-bf0c-4b2a-a5b3-0481beadd2ba.txn b/.lancedb/nltk_chunking.lance/_transactions/324-1aca772c-bf0c-4b2a-a5b3-0481beadd2ba.txn new file mode 100644 index 0000000000000000000000000000000000000000..abd71fa799bbb6f57d8f0d71046b97aba5ac913f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/324-1aca772c-bf0c-4b2a-a5b3-0481beadd2ba.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/325-484d1a7a-6dea-410a-b662-c27834404ce2.txn b/.lancedb/nltk_chunking.lance/_transactions/325-484d1a7a-6dea-410a-b662-c27834404ce2.txn new file mode 100644 index 0000000000000000000000000000000000000000..b3175e9248e579c2dba2ea9541af9a7f18837d7e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/325-484d1a7a-6dea-410a-b662-c27834404ce2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/326-b35bfbad-ab38-4009-9f8f-c4af0423962b.txn b/.lancedb/nltk_chunking.lance/_transactions/326-b35bfbad-ab38-4009-9f8f-c4af0423962b.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b82c26db0fde2c1cbe82385292527b6fefda4db Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/326-b35bfbad-ab38-4009-9f8f-c4af0423962b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/327-a7984f93-09cb-4450-ab20-24e96693aa56.txn b/.lancedb/nltk_chunking.lance/_transactions/327-a7984f93-09cb-4450-ab20-24e96693aa56.txn new file mode 100644 index 0000000000000000000000000000000000000000..8434b7ea4eb02c1ead625e11af574c8e5fad0925 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/327-a7984f93-09cb-4450-ab20-24e96693aa56.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/328-8e80de4e-4541-4298-93bd-c5cedf19c61f.txn b/.lancedb/nltk_chunking.lance/_transactions/328-8e80de4e-4541-4298-93bd-c5cedf19c61f.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a310b3e84f7a16fdea251f1e0000fcf3bb96eba Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/328-8e80de4e-4541-4298-93bd-c5cedf19c61f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/329-251aa128-4d1c-4a7f-aaed-557366550609.txn b/.lancedb/nltk_chunking.lance/_transactions/329-251aa128-4d1c-4a7f-aaed-557366550609.txn new file mode 100644 index 0000000000000000000000000000000000000000..5028926d79f827359f79311d8b6fec5af395f45a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/329-251aa128-4d1c-4a7f-aaed-557366550609.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/33-659719c3-264d-4082-9ca0-ebd3b9eb91ea.txn b/.lancedb/nltk_chunking.lance/_transactions/33-659719c3-264d-4082-9ca0-ebd3b9eb91ea.txn new file mode 100644 index 0000000000000000000000000000000000000000..664a322ef2bffa9c348ba2d7c27ca66aae8cf05a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/33-659719c3-264d-4082-9ca0-ebd3b9eb91ea.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/330-7129f008-9984-47fc-9f92-0f92b8f3e704.txn 
b/.lancedb/nltk_chunking.lance/_transactions/330-7129f008-9984-47fc-9f92-0f92b8f3e704.txn new file mode 100644 index 0000000000000000000000000000000000000000..b806d3f14f052b48867d7be33ca0af2d46299feb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/330-7129f008-9984-47fc-9f92-0f92b8f3e704.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/331-292ec50c-b17b-4c18-9b14-192993d8a4ef.txn b/.lancedb/nltk_chunking.lance/_transactions/331-292ec50c-b17b-4c18-9b14-192993d8a4ef.txn new file mode 100644 index 0000000000000000000000000000000000000000..57a219eed268b4c16da246413a54e3498cb3b712 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/331-292ec50c-b17b-4c18-9b14-192993d8a4ef.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/332-402b4628-4e6c-4c02-83aa-923ffd45f408.txn b/.lancedb/nltk_chunking.lance/_transactions/332-402b4628-4e6c-4c02-83aa-923ffd45f408.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a70c13d73d281e8cddaa5d63e3252ff03357d06 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/332-402b4628-4e6c-4c02-83aa-923ffd45f408.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/333-5856a766-97e4-487f-aa25-103b400caf23.txn b/.lancedb/nltk_chunking.lance/_transactions/333-5856a766-97e4-487f-aa25-103b400caf23.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a47849fadec6d4e5aca4dfcf447918264f5cf70 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/333-5856a766-97e4-487f-aa25-103b400caf23.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/334-7f9bd89f-a61b-4f36-a3d4-ddc09f88810f.txn b/.lancedb/nltk_chunking.lance/_transactions/334-7f9bd89f-a61b-4f36-a3d4-ddc09f88810f.txn new file mode 100644 index 0000000000000000000000000000000000000000..141e036924cf3abf5a8081915fa4e0d6534b7efc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/334-7f9bd89f-a61b-4f36-a3d4-ddc09f88810f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/335-a86f7f64-9005-4709-bdbb-94c4780c6c03.txn b/.lancedb/nltk_chunking.lance/_transactions/335-a86f7f64-9005-4709-bdbb-94c4780c6c03.txn new file mode 100644 index 0000000000000000000000000000000000000000..44cd945ec2004d4a8903b6f7719bce5a31ea75c1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/335-a86f7f64-9005-4709-bdbb-94c4780c6c03.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/336-d8e7bb85-3f61-476e-892b-43984b611e05.txn b/.lancedb/nltk_chunking.lance/_transactions/336-d8e7bb85-3f61-476e-892b-43984b611e05.txn new file mode 100644 index 0000000000000000000000000000000000000000..211eba24e4667240fc9b602360f05aa57ec66425 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/336-d8e7bb85-3f61-476e-892b-43984b611e05.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/337-b314b2cf-0c7d-4a2c-9619-18947ed89764.txn b/.lancedb/nltk_chunking.lance/_transactions/337-b314b2cf-0c7d-4a2c-9619-18947ed89764.txn new file mode 100644 index 0000000000000000000000000000000000000000..b5b1e063b0f20dd16bf07f4db0d5ae654c989035 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/337-b314b2cf-0c7d-4a2c-9619-18947ed89764.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/338-8db68bc5-eae1-46e0-9639-f78be016f887.txn b/.lancedb/nltk_chunking.lance/_transactions/338-8db68bc5-eae1-46e0-9639-f78be016f887.txn new file mode 100644 index 
0000000000000000000000000000000000000000..658eb6560853e8267c22ead812b3a24fbc2b0170 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/338-8db68bc5-eae1-46e0-9639-f78be016f887.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/339-a1bcee57-6e01-42bc-a7b9-87a3e62b4a39.txn b/.lancedb/nltk_chunking.lance/_transactions/339-a1bcee57-6e01-42bc-a7b9-87a3e62b4a39.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd404e2a87dd87fde0ed11c4f17173c3f9f9af4b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/339-a1bcee57-6e01-42bc-a7b9-87a3e62b4a39.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/34-ab70914a-fbec-44b2-a5a5-8e2caa244a35.txn b/.lancedb/nltk_chunking.lance/_transactions/34-ab70914a-fbec-44b2-a5a5-8e2caa244a35.txn new file mode 100644 index 0000000000000000000000000000000000000000..c4ed08d10af7f1e68e95ddec1594cdbd3605bf5c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/34-ab70914a-fbec-44b2-a5a5-8e2caa244a35.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/340-48aed382-5ce7-4939-ab6d-a28fb5dfdeaf.txn b/.lancedb/nltk_chunking.lance/_transactions/340-48aed382-5ce7-4939-ab6d-a28fb5dfdeaf.txn new file mode 100644 index 0000000000000000000000000000000000000000..1c58637b8a56107dde5fdc507d9675cb57b4b42a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/340-48aed382-5ce7-4939-ab6d-a28fb5dfdeaf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/341-f9de1dd0-4a80-45dc-bde2-2ff3f0d88139.txn b/.lancedb/nltk_chunking.lance/_transactions/341-f9de1dd0-4a80-45dc-bde2-2ff3f0d88139.txn new file mode 100644 index 0000000000000000000000000000000000000000..49adc981a001005c3bb0512dca419935a0863b6a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/341-f9de1dd0-4a80-45dc-bde2-2ff3f0d88139.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/342-9b635b25-db84-4236-ae59-14ce0e501688.txn b/.lancedb/nltk_chunking.lance/_transactions/342-9b635b25-db84-4236-ae59-14ce0e501688.txn new file mode 100644 index 0000000000000000000000000000000000000000..a510a435f83cfd2fd8cd4782a4d9a5e951b1bc1a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/342-9b635b25-db84-4236-ae59-14ce0e501688.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/343-c45df2d9-68ae-420f-a9b2-7dddce0daa8d.txn b/.lancedb/nltk_chunking.lance/_transactions/343-c45df2d9-68ae-420f-a9b2-7dddce0daa8d.txn new file mode 100644 index 0000000000000000000000000000000000000000..80f74df86a8df60bc6c8a1b00dd7b52fc653693f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/343-c45df2d9-68ae-420f-a9b2-7dddce0daa8d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/344-0c454ceb-0108-476a-82e4-112c8be65704.txn b/.lancedb/nltk_chunking.lance/_transactions/344-0c454ceb-0108-476a-82e4-112c8be65704.txn new file mode 100644 index 0000000000000000000000000000000000000000..63852d725afd234257d4451cd17a04c800fce810 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/344-0c454ceb-0108-476a-82e4-112c8be65704.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/345-335597ae-a069-4561-ab28-9b9eb5598445.txn b/.lancedb/nltk_chunking.lance/_transactions/345-335597ae-a069-4561-ab28-9b9eb5598445.txn new file mode 100644 index 0000000000000000000000000000000000000000..0049c3f538553738ddd56870b986f414b492d0e2 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/345-335597ae-a069-4561-ab28-9b9eb5598445.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/346-d88b619e-5ff3-4260-9f97-72b38521d9e9.txn b/.lancedb/nltk_chunking.lance/_transactions/346-d88b619e-5ff3-4260-9f97-72b38521d9e9.txn new file mode 100644 index 0000000000000000000000000000000000000000..2fbf21de8f7b8d91a7b7c8a38b1242cfe29857f1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/346-d88b619e-5ff3-4260-9f97-72b38521d9e9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/347-602f320a-b822-4acd-bc5d-31848ea54bcd.txn b/.lancedb/nltk_chunking.lance/_transactions/347-602f320a-b822-4acd-bc5d-31848ea54bcd.txn new file mode 100644 index 0000000000000000000000000000000000000000..483e4dc622222981b823924dc9e80f2f74c2f2f9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/347-602f320a-b822-4acd-bc5d-31848ea54bcd.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/348-8300bcc6-4f97-4062-9cc6-532f08fca840.txn b/.lancedb/nltk_chunking.lance/_transactions/348-8300bcc6-4f97-4062-9cc6-532f08fca840.txn new file mode 100644 index 0000000000000000000000000000000000000000..702d0ed7b79c2044fa7645398dcc40aa9c7bf215 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/348-8300bcc6-4f97-4062-9cc6-532f08fca840.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/349-3ab688b0-184c-4cb2-9a91-bd6482a31199.txn b/.lancedb/nltk_chunking.lance/_transactions/349-3ab688b0-184c-4cb2-9a91-bd6482a31199.txn new file mode 100644 index 0000000000000000000000000000000000000000..7b22a137d08c475c15a6bd304c5880bea7722e33 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/349-3ab688b0-184c-4cb2-9a91-bd6482a31199.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/35-5a441104-8f9c-42f3-acee-93f1e09acaa5.txn b/.lancedb/nltk_chunking.lance/_transactions/35-5a441104-8f9c-42f3-acee-93f1e09acaa5.txn new file mode 100644 index 0000000000000000000000000000000000000000..558cd199058c8463662c0129c9975c9d6da89d96 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/35-5a441104-8f9c-42f3-acee-93f1e09acaa5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/350-3f1c3e42-688c-4dc2-8245-e2773dc1555b.txn b/.lancedb/nltk_chunking.lance/_transactions/350-3f1c3e42-688c-4dc2-8245-e2773dc1555b.txn new file mode 100644 index 0000000000000000000000000000000000000000..45ce1c9cd5114389b52c989056202c1c91d04baf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/350-3f1c3e42-688c-4dc2-8245-e2773dc1555b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/351-d8fbaed5-b45b-454e-9ca5-3fdb8bf042e8.txn b/.lancedb/nltk_chunking.lance/_transactions/351-d8fbaed5-b45b-454e-9ca5-3fdb8bf042e8.txn new file mode 100644 index 0000000000000000000000000000000000000000..aedf4105d732a20f1ecac2ab616eaff3785d1f13 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/351-d8fbaed5-b45b-454e-9ca5-3fdb8bf042e8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/352-e4114368-e7eb-4003-9bb8-814892993e63.txn b/.lancedb/nltk_chunking.lance/_transactions/352-e4114368-e7eb-4003-9bb8-814892993e63.txn new file mode 100644 index 0000000000000000000000000000000000000000..d81fded819f002b8158c2f4c1fafa5a7c1d75599 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/352-e4114368-e7eb-4003-9bb8-814892993e63.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/353-79326682-a747-4599-bb08-4f915cb887f4.txn b/.lancedb/nltk_chunking.lance/_transactions/353-79326682-a747-4599-bb08-4f915cb887f4.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff65d09f3e3deba0d68c6f74a897b534f8367fde Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/353-79326682-a747-4599-bb08-4f915cb887f4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/354-4e5efd6f-6e55-43d6-8b93-494c5e425101.txn b/.lancedb/nltk_chunking.lance/_transactions/354-4e5efd6f-6e55-43d6-8b93-494c5e425101.txn new file mode 100644 index 0000000000000000000000000000000000000000..b5081f8dd826b5898ab57e623c0995f26e0e5a66 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/354-4e5efd6f-6e55-43d6-8b93-494c5e425101.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/355-98e281bb-d37f-4fec-b5b0-b95812b37a50.txn b/.lancedb/nltk_chunking.lance/_transactions/355-98e281bb-d37f-4fec-b5b0-b95812b37a50.txn new file mode 100644 index 0000000000000000000000000000000000000000..dfb1f1feee2befe8ef3689435ea90758efe1d106 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/355-98e281bb-d37f-4fec-b5b0-b95812b37a50.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/356-b04ef749-3b07-4663-8631-4c066e8cad23.txn b/.lancedb/nltk_chunking.lance/_transactions/356-b04ef749-3b07-4663-8631-4c066e8cad23.txn new file mode 100644 index 0000000000000000000000000000000000000000..b734a69bc2e7793118269b7629be48fa6b0c0c1c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/356-b04ef749-3b07-4663-8631-4c066e8cad23.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/357-9fe03f7d-095a-47d6-a7ca-7b13aed2070f.txn b/.lancedb/nltk_chunking.lance/_transactions/357-9fe03f7d-095a-47d6-a7ca-7b13aed2070f.txn new file mode 100644 index 0000000000000000000000000000000000000000..a1e606d0df469679702ef61a0f3b2e6eb3a6193c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/357-9fe03f7d-095a-47d6-a7ca-7b13aed2070f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/358-c937f005-63d0-4828-aac3-0c28b982e0a9.txn b/.lancedb/nltk_chunking.lance/_transactions/358-c937f005-63d0-4828-aac3-0c28b982e0a9.txn new file mode 100644 index 0000000000000000000000000000000000000000..e598b36dac242ced5b66f43effe61ef93db636b2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/358-c937f005-63d0-4828-aac3-0c28b982e0a9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/359-f92a60e5-111b-4c9b-afad-51550c00e2cf.txn b/.lancedb/nltk_chunking.lance/_transactions/359-f92a60e5-111b-4c9b-afad-51550c00e2cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..038016c9ff157591981c0582bbd7bba8d308ddac Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/359-f92a60e5-111b-4c9b-afad-51550c00e2cf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/36-a8b6786f-3e01-4ab9-8112-a5e748c1a272.txn b/.lancedb/nltk_chunking.lance/_transactions/36-a8b6786f-3e01-4ab9-8112-a5e748c1a272.txn new file mode 100644 index 0000000000000000000000000000000000000000..d2d4c5276c7a97781ebd8b286cb54de30ff16dff Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/36-a8b6786f-3e01-4ab9-8112-a5e748c1a272.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/360-5b972416-d90a-4821-a9e1-685c5bd67ed2.txn 
b/.lancedb/nltk_chunking.lance/_transactions/360-5b972416-d90a-4821-a9e1-685c5bd67ed2.txn new file mode 100644 index 0000000000000000000000000000000000000000..3469cadff29d26d13d84054d8351e3b8dabacf39 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/360-5b972416-d90a-4821-a9e1-685c5bd67ed2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/361-d8009a59-1a64-41ef-8527-45750158ce71.txn b/.lancedb/nltk_chunking.lance/_transactions/361-d8009a59-1a64-41ef-8527-45750158ce71.txn new file mode 100644 index 0000000000000000000000000000000000000000..bc893e00be266dfca9bdc2259a89b45df407d826 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/361-d8009a59-1a64-41ef-8527-45750158ce71.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/362-b4e26273-9181-4ddf-86d9-6fe60a392419.txn b/.lancedb/nltk_chunking.lance/_transactions/362-b4e26273-9181-4ddf-86d9-6fe60a392419.txn new file mode 100644 index 0000000000000000000000000000000000000000..6f8f6fa913ee0fb12be28655c8c0bd56922288bd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/362-b4e26273-9181-4ddf-86d9-6fe60a392419.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/363-f74f2497-2949-48ae-88e8-7a724542c5f5.txn b/.lancedb/nltk_chunking.lance/_transactions/363-f74f2497-2949-48ae-88e8-7a724542c5f5.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0299fe9db075f7390307e8bf7f315b0d6e95173 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/363-f74f2497-2949-48ae-88e8-7a724542c5f5.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/364-12a39396-2501-4895-9aa8-0098e5011223.txn b/.lancedb/nltk_chunking.lance/_transactions/364-12a39396-2501-4895-9aa8-0098e5011223.txn new file mode 100644 index 0000000000000000000000000000000000000000..07404770a59f8f5802500f912b237552489f9dd6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/364-12a39396-2501-4895-9aa8-0098e5011223.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/365-99d0c4fc-398f-4393-a841-157523cd2423.txn b/.lancedb/nltk_chunking.lance/_transactions/365-99d0c4fc-398f-4393-a841-157523cd2423.txn new file mode 100644 index 0000000000000000000000000000000000000000..4b1ebc88e2777a75f990af29f34eb3997baedc83 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/365-99d0c4fc-398f-4393-a841-157523cd2423.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/366-88688bdd-5b11-465a-9c77-22eb2cf235e2.txn b/.lancedb/nltk_chunking.lance/_transactions/366-88688bdd-5b11-465a-9c77-22eb2cf235e2.txn new file mode 100644 index 0000000000000000000000000000000000000000..696744b7da8ce0e100aa6c84ea903f47ab834e35 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/366-88688bdd-5b11-465a-9c77-22eb2cf235e2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/367-afb1abff-5dfc-40f8-b6ba-000de78c4a6d.txn b/.lancedb/nltk_chunking.lance/_transactions/367-afb1abff-5dfc-40f8-b6ba-000de78c4a6d.txn new file mode 100644 index 0000000000000000000000000000000000000000..e527ca8e233f61f491a6b54dedea331bfac02d35 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/367-afb1abff-5dfc-40f8-b6ba-000de78c4a6d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/368-f16ff789-2d28-4233-9bee-1beecb25bca0.txn b/.lancedb/nltk_chunking.lance/_transactions/368-f16ff789-2d28-4233-9bee-1beecb25bca0.txn new file mode 100644 index 
0000000000000000000000000000000000000000..337ce6a9fb10ce20b601832e17b1cfc2c1b214f7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/368-f16ff789-2d28-4233-9bee-1beecb25bca0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/369-9e02b1f8-8cb7-42d8-8bff-ff50eee170c8.txn b/.lancedb/nltk_chunking.lance/_transactions/369-9e02b1f8-8cb7-42d8-8bff-ff50eee170c8.txn new file mode 100644 index 0000000000000000000000000000000000000000..b811c23b63193246a5b9d7afacc760a22a0a2753 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/369-9e02b1f8-8cb7-42d8-8bff-ff50eee170c8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/37-52acd308-15a7-4851-9e45-8e6898320905.txn b/.lancedb/nltk_chunking.lance/_transactions/37-52acd308-15a7-4851-9e45-8e6898320905.txn new file mode 100644 index 0000000000000000000000000000000000000000..c933a065b94a66ef3bafad8bc89f11401df7bc41 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/37-52acd308-15a7-4851-9e45-8e6898320905.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/370-2cd743a7-8da4-4c8a-af01-417b5f20cb53.txn b/.lancedb/nltk_chunking.lance/_transactions/370-2cd743a7-8da4-4c8a-af01-417b5f20cb53.txn new file mode 100644 index 0000000000000000000000000000000000000000..8b030b865c70f97f592b4f8786ae233c67be1c6f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/370-2cd743a7-8da4-4c8a-af01-417b5f20cb53.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/371-d067daca-e396-4e3a-9be2-e58c4ce0d22a.txn b/.lancedb/nltk_chunking.lance/_transactions/371-d067daca-e396-4e3a-9be2-e58c4ce0d22a.txn new file mode 100644 index 0000000000000000000000000000000000000000..e44a2aa1a5ddbeabad3b63fb533b523eec1f3c65 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/371-d067daca-e396-4e3a-9be2-e58c4ce0d22a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/372-a71b5f40-949c-4da3-80eb-6ced450d9713.txn b/.lancedb/nltk_chunking.lance/_transactions/372-a71b5f40-949c-4da3-80eb-6ced450d9713.txn new file mode 100644 index 0000000000000000000000000000000000000000..4cc5f5a459e5819a8dfce45ad901231a14d726f7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/372-a71b5f40-949c-4da3-80eb-6ced450d9713.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/373-901695ab-d2df-46fd-a7d8-e638e999e60e.txn b/.lancedb/nltk_chunking.lance/_transactions/373-901695ab-d2df-46fd-a7d8-e638e999e60e.txn new file mode 100644 index 0000000000000000000000000000000000000000..cf177999afc29aa89f67070d96f89a3c593b61c7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/373-901695ab-d2df-46fd-a7d8-e638e999e60e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/374-bad33ae4-7f97-44f0-9865-9cc73941d72d.txn b/.lancedb/nltk_chunking.lance/_transactions/374-bad33ae4-7f97-44f0-9865-9cc73941d72d.txn new file mode 100644 index 0000000000000000000000000000000000000000..9c95b6078f817a7c770d2ebe5f60c835d8c5ba37 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/374-bad33ae4-7f97-44f0-9865-9cc73941d72d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/375-4ff2f256-d7bf-4485-af37-33a0a707ead3.txn b/.lancedb/nltk_chunking.lance/_transactions/375-4ff2f256-d7bf-4485-af37-33a0a707ead3.txn new file mode 100644 index 0000000000000000000000000000000000000000..c1697fff0d81dd7f8902067c770f55d5880e7c77 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/375-4ff2f256-d7bf-4485-af37-33a0a707ead3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/376-16d973d4-3b5c-45f8-82d7-733829c4162e.txn b/.lancedb/nltk_chunking.lance/_transactions/376-16d973d4-3b5c-45f8-82d7-733829c4162e.txn new file mode 100644 index 0000000000000000000000000000000000000000..6c8a70020ce3615f9c25001cf4aa88f458693a2c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/376-16d973d4-3b5c-45f8-82d7-733829c4162e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/377-24dc55db-21c1-43d7-96de-2645a63fe7b0.txn b/.lancedb/nltk_chunking.lance/_transactions/377-24dc55db-21c1-43d7-96de-2645a63fe7b0.txn new file mode 100644 index 0000000000000000000000000000000000000000..bf373f6b474b173875161c99b05740d769367d62 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/377-24dc55db-21c1-43d7-96de-2645a63fe7b0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/378-4cd91d44-7c80-447c-a297-db50111f04f0.txn b/.lancedb/nltk_chunking.lance/_transactions/378-4cd91d44-7c80-447c-a297-db50111f04f0.txn new file mode 100644 index 0000000000000000000000000000000000000000..fed09df6f7101ec84f05ff838f94ed3b22519761 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/378-4cd91d44-7c80-447c-a297-db50111f04f0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/379-4ce047ab-9d1a-4c61-a4da-ce97a6a804e9.txn b/.lancedb/nltk_chunking.lance/_transactions/379-4ce047ab-9d1a-4c61-a4da-ce97a6a804e9.txn new file mode 100644 index 0000000000000000000000000000000000000000..d578aab25ad0befc2c2231a6559db890046e27cb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/379-4ce047ab-9d1a-4c61-a4da-ce97a6a804e9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/38-aec99a61-4109-437d-9b2d-9cdf85c7672e.txn b/.lancedb/nltk_chunking.lance/_transactions/38-aec99a61-4109-437d-9b2d-9cdf85c7672e.txn new file mode 100644 index 0000000000000000000000000000000000000000..099f7b21c58fe48edf2249bf5664994f52d81973 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/38-aec99a61-4109-437d-9b2d-9cdf85c7672e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/380-1a740262-eea1-454d-b54a-8270e5147d8a.txn b/.lancedb/nltk_chunking.lance/_transactions/380-1a740262-eea1-454d-b54a-8270e5147d8a.txn new file mode 100644 index 0000000000000000000000000000000000000000..5f45484e4087b0b693a16481c4fba525679fcbf5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/380-1a740262-eea1-454d-b54a-8270e5147d8a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/381-93288a78-b393-4ba9-ba31-129bd4472fc9.txn b/.lancedb/nltk_chunking.lance/_transactions/381-93288a78-b393-4ba9-ba31-129bd4472fc9.txn new file mode 100644 index 0000000000000000000000000000000000000000..95bd801e7dd4c2c3deba75dc9a3875682809edb0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/381-93288a78-b393-4ba9-ba31-129bd4472fc9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/382-1e7a6016-0578-4144-a412-acfad75dcee9.txn b/.lancedb/nltk_chunking.lance/_transactions/382-1e7a6016-0578-4144-a412-acfad75dcee9.txn new file mode 100644 index 0000000000000000000000000000000000000000..ad659ec28b020d105cc8b0496a41681c791d975a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/382-1e7a6016-0578-4144-a412-acfad75dcee9.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/383-bf47d9fb-7391-41d8-89fb-ec4fdd90ffb4.txn b/.lancedb/nltk_chunking.lance/_transactions/383-bf47d9fb-7391-41d8-89fb-ec4fdd90ffb4.txn new file mode 100644 index 0000000000000000000000000000000000000000..877dcc0720a2bfd8b5e893824a920494fad46095 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/383-bf47d9fb-7391-41d8-89fb-ec4fdd90ffb4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/384-a385b778-8461-468e-ad5b-99d0a8a54c0f.txn b/.lancedb/nltk_chunking.lance/_transactions/384-a385b778-8461-468e-ad5b-99d0a8a54c0f.txn new file mode 100644 index 0000000000000000000000000000000000000000..d00d5424292e67d4a4b8e53d0406be40e66076a7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/384-a385b778-8461-468e-ad5b-99d0a8a54c0f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/385-d725a21a-63d8-4d14-9cf2-992371849fb9.txn b/.lancedb/nltk_chunking.lance/_transactions/385-d725a21a-63d8-4d14-9cf2-992371849fb9.txn new file mode 100644 index 0000000000000000000000000000000000000000..ef87a8dbfd2afae75983b798b81ae1cbb266d399 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/385-d725a21a-63d8-4d14-9cf2-992371849fb9.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/386-01c0b271-50bf-405e-b916-9aaa7e56d44f.txn b/.lancedb/nltk_chunking.lance/_transactions/386-01c0b271-50bf-405e-b916-9aaa7e56d44f.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ff6bc2c5abbefdf47d02855e42d63292cc7a10e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/386-01c0b271-50bf-405e-b916-9aaa7e56d44f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/387-5c35f2d6-7ce6-43c0-948a-2e7d8a482d4d.txn b/.lancedb/nltk_chunking.lance/_transactions/387-5c35f2d6-7ce6-43c0-948a-2e7d8a482d4d.txn new file mode 100644 index 0000000000000000000000000000000000000000..410913aa89137a262094c5b66eee898c7d9924be Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/387-5c35f2d6-7ce6-43c0-948a-2e7d8a482d4d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/388-9b09d9cd-4edb-4ef9-b983-76d45389151b.txn b/.lancedb/nltk_chunking.lance/_transactions/388-9b09d9cd-4edb-4ef9-b983-76d45389151b.txn new file mode 100644 index 0000000000000000000000000000000000000000..edd55aa2a40fa7fdc3248cca5e52f8174405720a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/388-9b09d9cd-4edb-4ef9-b983-76d45389151b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/389-ae6c7233-999d-41f4-b376-6b0615af58b1.txn b/.lancedb/nltk_chunking.lance/_transactions/389-ae6c7233-999d-41f4-b376-6b0615af58b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..2917033a70b8d1e65b2aa0e107d900824e3af12c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/389-ae6c7233-999d-41f4-b376-6b0615af58b1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/39-8fda6eb9-4393-4796-b3e7-fd3b701c515b.txn b/.lancedb/nltk_chunking.lance/_transactions/39-8fda6eb9-4393-4796-b3e7-fd3b701c515b.txn new file mode 100644 index 0000000000000000000000000000000000000000..f994cffc35831f0a72e122b084c99b3c16d0db29 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/39-8fda6eb9-4393-4796-b3e7-fd3b701c515b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/390-072606e1-9a0e-4fff-a334-e5665fcdc7bd.txn 
b/.lancedb/nltk_chunking.lance/_transactions/390-072606e1-9a0e-4fff-a334-e5665fcdc7bd.txn new file mode 100644 index 0000000000000000000000000000000000000000..446727a87dbc841305fb2e13c95b272c3ffea027 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/390-072606e1-9a0e-4fff-a334-e5665fcdc7bd.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/391-d82b74df-9489-46d1-bf1a-23f32cdc5f01.txn b/.lancedb/nltk_chunking.lance/_transactions/391-d82b74df-9489-46d1-bf1a-23f32cdc5f01.txn new file mode 100644 index 0000000000000000000000000000000000000000..42d436a5330fb357f4d0a80f8e2f83ca0c0d4b95 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/391-d82b74df-9489-46d1-bf1a-23f32cdc5f01.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/392-37e7e2f5-c878-4b1e-a76e-b57e1d4a6202.txn b/.lancedb/nltk_chunking.lance/_transactions/392-37e7e2f5-c878-4b1e-a76e-b57e1d4a6202.txn new file mode 100644 index 0000000000000000000000000000000000000000..7248715bb3fe4fc8ff37df786396659e7c26b76a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/392-37e7e2f5-c878-4b1e-a76e-b57e1d4a6202.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/393-f113af19-8b08-41cc-a31b-706326ae04a0.txn b/.lancedb/nltk_chunking.lance/_transactions/393-f113af19-8b08-41cc-a31b-706326ae04a0.txn new file mode 100644 index 0000000000000000000000000000000000000000..7bca1b241e4d4e65ca211fd9f586b73787e30dd2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/393-f113af19-8b08-41cc-a31b-706326ae04a0.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/394-f7bc0253-1a57-49ba-95be-7f296179d4c1.txn b/.lancedb/nltk_chunking.lance/_transactions/394-f7bc0253-1a57-49ba-95be-7f296179d4c1.txn new file mode 100644 index 0000000000000000000000000000000000000000..37d998332bc1d03bab1c1da656e37b738269c169 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/394-f7bc0253-1a57-49ba-95be-7f296179d4c1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/395-6a9d727c-00c6-4e29-8f32-e9fe898a883f.txn b/.lancedb/nltk_chunking.lance/_transactions/395-6a9d727c-00c6-4e29-8f32-e9fe898a883f.txn new file mode 100644 index 0000000000000000000000000000000000000000..9e3770837654370360f56e142e7244f9db10ccf9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/395-6a9d727c-00c6-4e29-8f32-e9fe898a883f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/396-db9576fa-8747-4bdd-ae35-888ec2e11735.txn b/.lancedb/nltk_chunking.lance/_transactions/396-db9576fa-8747-4bdd-ae35-888ec2e11735.txn new file mode 100644 index 0000000000000000000000000000000000000000..2cfde0f6f7b1dd0ea0f760ce75ad237818e376df Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/396-db9576fa-8747-4bdd-ae35-888ec2e11735.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/397-45a3ab8d-89a2-4236-93c1-043980555168.txn b/.lancedb/nltk_chunking.lance/_transactions/397-45a3ab8d-89a2-4236-93c1-043980555168.txn new file mode 100644 index 0000000000000000000000000000000000000000..bd339a7e3493d4c93dd91d7289d052623c2bf6c2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/397-45a3ab8d-89a2-4236-93c1-043980555168.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/4-62289851-05b7-4355-bfd1-7bba30556015.txn b/.lancedb/nltk_chunking.lance/_transactions/4-62289851-05b7-4355-bfd1-7bba30556015.txn new file mode 100644 index 
0000000000000000000000000000000000000000..d98821281148a2f22736ce179b68765b20d05319 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/4-62289851-05b7-4355-bfd1-7bba30556015.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/40-548e959d-12e6-42cb-9c47-4175c05d62e1.txn b/.lancedb/nltk_chunking.lance/_transactions/40-548e959d-12e6-42cb-9c47-4175c05d62e1.txn new file mode 100644 index 0000000000000000000000000000000000000000..130cb27efa42c653cc943ce616181e38910b0a1d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/40-548e959d-12e6-42cb-9c47-4175c05d62e1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/41-b1e5e13a-ad29-4f9a-b28d-ecb922b495d6.txn b/.lancedb/nltk_chunking.lance/_transactions/41-b1e5e13a-ad29-4f9a-b28d-ecb922b495d6.txn new file mode 100644 index 0000000000000000000000000000000000000000..31982e75ba19fbaa4ffef796d6524e202eca8d64 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/41-b1e5e13a-ad29-4f9a-b28d-ecb922b495d6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/42-361da8fa-4e53-43f9-869d-4723ce0975b1.txn b/.lancedb/nltk_chunking.lance/_transactions/42-361da8fa-4e53-43f9-869d-4723ce0975b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..857fea03419f27c55080a202dee1aa183b986b05 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/42-361da8fa-4e53-43f9-869d-4723ce0975b1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/43-fea964a1-bbbc-4dfc-b505-327bf25ad629.txn b/.lancedb/nltk_chunking.lance/_transactions/43-fea964a1-bbbc-4dfc-b505-327bf25ad629.txn new file mode 100644 index 0000000000000000000000000000000000000000..538c802c704599bdb32ed2a93445bbbe47d4e2e1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/43-fea964a1-bbbc-4dfc-b505-327bf25ad629.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/44-d0c3e841-a3e3-4d4a-a320-ad2aa4986299.txn b/.lancedb/nltk_chunking.lance/_transactions/44-d0c3e841-a3e3-4d4a-a320-ad2aa4986299.txn new file mode 100644 index 0000000000000000000000000000000000000000..0b3c62d14779a38f3c0390d47fba015746cd1507 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/44-d0c3e841-a3e3-4d4a-a320-ad2aa4986299.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/45-2072b085-5dc0-444c-a4e0-0b0e129c3e09.txn b/.lancedb/nltk_chunking.lance/_transactions/45-2072b085-5dc0-444c-a4e0-0b0e129c3e09.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e5406421030880d07ed3298741e9171e1bf0dec Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/45-2072b085-5dc0-444c-a4e0-0b0e129c3e09.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/46-7be6460c-ac0f-4c64-bd33-127d6ccf15ba.txn b/.lancedb/nltk_chunking.lance/_transactions/46-7be6460c-ac0f-4c64-bd33-127d6ccf15ba.txn new file mode 100644 index 0000000000000000000000000000000000000000..97fe97fd81a459b20dfcf41890827f764e3bbb70 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/46-7be6460c-ac0f-4c64-bd33-127d6ccf15ba.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/47-aca92f7c-766d-4043-a866-c5ea3faeb59a.txn b/.lancedb/nltk_chunking.lance/_transactions/47-aca92f7c-766d-4043-a866-c5ea3faeb59a.txn new file mode 100644 index 0000000000000000000000000000000000000000..acc490c7a28ec8ccaedea869ecac38e825965789 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/47-aca92f7c-766d-4043-a866-c5ea3faeb59a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/48-6c6756f1-de18-4208-8b54-8928d0dd8836.txn b/.lancedb/nltk_chunking.lance/_transactions/48-6c6756f1-de18-4208-8b54-8928d0dd8836.txn new file mode 100644 index 0000000000000000000000000000000000000000..9acf7d1c97da9ea40774c1bf7bd1bc2efef04d5f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/48-6c6756f1-de18-4208-8b54-8928d0dd8836.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/49-cb68ee61-4d97-4a53-86f1-191c42eb826e.txn b/.lancedb/nltk_chunking.lance/_transactions/49-cb68ee61-4d97-4a53-86f1-191c42eb826e.txn new file mode 100644 index 0000000000000000000000000000000000000000..ff1b93a09951b26b69a1c8c94ff6ca654df3ce55 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/49-cb68ee61-4d97-4a53-86f1-191c42eb826e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/5-ee0817b0-3796-4dea-95b5-6fcf7dae5f41.txn b/.lancedb/nltk_chunking.lance/_transactions/5-ee0817b0-3796-4dea-95b5-6fcf7dae5f41.txn new file mode 100644 index 0000000000000000000000000000000000000000..72f7049057b66645a61deb65ccf00cf47bf46704 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/5-ee0817b0-3796-4dea-95b5-6fcf7dae5f41.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/50-f49131aa-7722-4aa1-aab0-7b6b30ca775b.txn b/.lancedb/nltk_chunking.lance/_transactions/50-f49131aa-7722-4aa1-aab0-7b6b30ca775b.txn new file mode 100644 index 0000000000000000000000000000000000000000..d5a9fe448617d5c580ff6235a56119ac8f8c43cf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/50-f49131aa-7722-4aa1-aab0-7b6b30ca775b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/51-f0293d16-e607-40ce-bb56-f4211eff46f2.txn b/.lancedb/nltk_chunking.lance/_transactions/51-f0293d16-e607-40ce-bb56-f4211eff46f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..8f02848e0938915f5a74a2c21a0955906b4bb2fb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/51-f0293d16-e607-40ce-bb56-f4211eff46f2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/52-3fe07cd5-11fd-4673-9fb2-c7c4684bd950.txn b/.lancedb/nltk_chunking.lance/_transactions/52-3fe07cd5-11fd-4673-9fb2-c7c4684bd950.txn new file mode 100644 index 0000000000000000000000000000000000000000..87bf4d5fa3dc0966af7d1eb1ac2b6986d784e538 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/52-3fe07cd5-11fd-4673-9fb2-c7c4684bd950.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/53-7198c749-9760-4495-9707-72d287aeae7a.txn b/.lancedb/nltk_chunking.lance/_transactions/53-7198c749-9760-4495-9707-72d287aeae7a.txn new file mode 100644 index 0000000000000000000000000000000000000000..e743d04edbd5cc287e2bb87527dfe62667b52997 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/53-7198c749-9760-4495-9707-72d287aeae7a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/54-77998c86-8ff6-4a69-bffc-e80000578215.txn b/.lancedb/nltk_chunking.lance/_transactions/54-77998c86-8ff6-4a69-bffc-e80000578215.txn new file mode 100644 index 0000000000000000000000000000000000000000..844703895af0b62e7a72bd8335660fa26e5e90b1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/54-77998c86-8ff6-4a69-bffc-e80000578215.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/55-ce56d584-1483-435e-ac50-85040f4691ee.txn b/.lancedb/nltk_chunking.lance/_transactions/55-ce56d584-1483-435e-ac50-85040f4691ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..a82844b6bcb81916366e9ae8b1f272a520abb767 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/55-ce56d584-1483-435e-ac50-85040f4691ee.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/56-525240ec-983f-40a6-b716-603576ea6ed2.txn b/.lancedb/nltk_chunking.lance/_transactions/56-525240ec-983f-40a6-b716-603576ea6ed2.txn new file mode 100644 index 0000000000000000000000000000000000000000..a60d7abd178c8646d9d7ea8af0072098aabc48b4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/56-525240ec-983f-40a6-b716-603576ea6ed2.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/57-4905ed77-c0ba-409a-a64e-e956daae6164.txn b/.lancedb/nltk_chunking.lance/_transactions/57-4905ed77-c0ba-409a-a64e-e956daae6164.txn new file mode 100644 index 0000000000000000000000000000000000000000..ad9a2c379c66f1357118eca1497f9512b999fbd6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/57-4905ed77-c0ba-409a-a64e-e956daae6164.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/58-4434f0d3-00f7-44b3-a669-7445bb276103.txn b/.lancedb/nltk_chunking.lance/_transactions/58-4434f0d3-00f7-44b3-a669-7445bb276103.txn new file mode 100644 index 0000000000000000000000000000000000000000..abd30f31d3aa8ff4916b52d65af8214b4a79dec3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/58-4434f0d3-00f7-44b3-a669-7445bb276103.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/59-d3b24bc0-8a7c-41c2-8e5b-a6f971ad8c7c.txn b/.lancedb/nltk_chunking.lance/_transactions/59-d3b24bc0-8a7c-41c2-8e5b-a6f971ad8c7c.txn new file mode 100644 index 0000000000000000000000000000000000000000..00621d9eac0f5052b4991d7bde3c244d419d493d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/59-d3b24bc0-8a7c-41c2-8e5b-a6f971ad8c7c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/6-c038dc48-bc9a-44fb-a257-51c07fa30bda.txn b/.lancedb/nltk_chunking.lance/_transactions/6-c038dc48-bc9a-44fb-a257-51c07fa30bda.txn new file mode 100644 index 0000000000000000000000000000000000000000..e971cc6a78fbb1c66f1131b33f75bb187b4a5262 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/6-c038dc48-bc9a-44fb-a257-51c07fa30bda.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/60-92aec8d2-f2f9-4ca9-8086-fa880bf5f4ec.txn b/.lancedb/nltk_chunking.lance/_transactions/60-92aec8d2-f2f9-4ca9-8086-fa880bf5f4ec.txn new file mode 100644 index 0000000000000000000000000000000000000000..2b43e1902f3a5bb54bccfec354792e8f88807903 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/60-92aec8d2-f2f9-4ca9-8086-fa880bf5f4ec.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/61-b56f57d7-8c40-47f8-be30-fb75581ab5d6.txn b/.lancedb/nltk_chunking.lance/_transactions/61-b56f57d7-8c40-47f8-be30-fb75581ab5d6.txn new file mode 100644 index 0000000000000000000000000000000000000000..e46bdaecbbd9beb8d88f68eabe3d85247854293f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/61-b56f57d7-8c40-47f8-be30-fb75581ab5d6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/62-c7f001af-28f9-4255-b2a9-1503c812a38f.txn b/.lancedb/nltk_chunking.lance/_transactions/62-c7f001af-28f9-4255-b2a9-1503c812a38f.txn new 
file mode 100644 index 0000000000000000000000000000000000000000..675c407510636b5999f3bfe027e23f685a8d4d80 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/62-c7f001af-28f9-4255-b2a9-1503c812a38f.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/63-98c028ad-ab79-4041-84df-fb035f9c2293.txn b/.lancedb/nltk_chunking.lance/_transactions/63-98c028ad-ab79-4041-84df-fb035f9c2293.txn new file mode 100644 index 0000000000000000000000000000000000000000..78059a581c51bd48a276103846c08142062a3f42 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/63-98c028ad-ab79-4041-84df-fb035f9c2293.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/64-f117ab45-4a82-4ca3-9d56-60663d562cb1.txn b/.lancedb/nltk_chunking.lance/_transactions/64-f117ab45-4a82-4ca3-9d56-60663d562cb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..edb64693d714246ca06aeba90c5932f1241ee417 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/64-f117ab45-4a82-4ca3-9d56-60663d562cb1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/65-86aa4a66-9c0a-4427-bafd-3ac66e31f91b.txn b/.lancedb/nltk_chunking.lance/_transactions/65-86aa4a66-9c0a-4427-bafd-3ac66e31f91b.txn new file mode 100644 index 0000000000000000000000000000000000000000..cd00bb4304ccaf2461dcd5f5a831f5e66cffae6b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/65-86aa4a66-9c0a-4427-bafd-3ac66e31f91b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/66-faff7a53-6b2d-4b83-907a-1e898bfee3b1.txn b/.lancedb/nltk_chunking.lance/_transactions/66-faff7a53-6b2d-4b83-907a-1e898bfee3b1.txn new file mode 100644 index 0000000000000000000000000000000000000000..75c13c47325f65f78f4e20afe161de20baf5c806 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/66-faff7a53-6b2d-4b83-907a-1e898bfee3b1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/67-fd6630a7-592e-4303-b7f3-e3d6b31de930.txn b/.lancedb/nltk_chunking.lance/_transactions/67-fd6630a7-592e-4303-b7f3-e3d6b31de930.txn new file mode 100644 index 0000000000000000000000000000000000000000..e3643c047e7bb930fd7b2721db318e2c69ff4a2a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/67-fd6630a7-592e-4303-b7f3-e3d6b31de930.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/68-765aef23-f5cd-458f-b8e4-207b1b62d7fc.txn b/.lancedb/nltk_chunking.lance/_transactions/68-765aef23-f5cd-458f-b8e4-207b1b62d7fc.txn new file mode 100644 index 0000000000000000000000000000000000000000..21c49a491139019a3dc96af6f33762caf0b767e7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/68-765aef23-f5cd-458f-b8e4-207b1b62d7fc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/69-3015058e-8b21-40df-9c9c-fc1c0878ab8c.txn b/.lancedb/nltk_chunking.lance/_transactions/69-3015058e-8b21-40df-9c9c-fc1c0878ab8c.txn new file mode 100644 index 0000000000000000000000000000000000000000..4dd82dec7c05250620b37f5c28c0fffb9f30e893 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/69-3015058e-8b21-40df-9c9c-fc1c0878ab8c.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/7-37134a4c-e2d8-4307-a4df-cb2081d8fc5b.txn b/.lancedb/nltk_chunking.lance/_transactions/7-37134a4c-e2d8-4307-a4df-cb2081d8fc5b.txn new file mode 100644 index 0000000000000000000000000000000000000000..f8fae9d6ca9adc33e551d566fdb95d8e84cf8609 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/7-37134a4c-e2d8-4307-a4df-cb2081d8fc5b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/70-74c4d13d-212e-49c3-a28f-ba47a7d2fbe8.txn b/.lancedb/nltk_chunking.lance/_transactions/70-74c4d13d-212e-49c3-a28f-ba47a7d2fbe8.txn new file mode 100644 index 0000000000000000000000000000000000000000..c71d77b007d13c179408a053b14169f26ab5dc5d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/70-74c4d13d-212e-49c3-a28f-ba47a7d2fbe8.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/71-0711e250-a78d-4a53-96fa-ade9dbe35a29.txn b/.lancedb/nltk_chunking.lance/_transactions/71-0711e250-a78d-4a53-96fa-ade9dbe35a29.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ed8e3618d72b632e44ed1ba31865139a0a57d04 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/71-0711e250-a78d-4a53-96fa-ade9dbe35a29.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/72-7d615d45-83fe-46f2-bcc3-edc60da23678.txn b/.lancedb/nltk_chunking.lance/_transactions/72-7d615d45-83fe-46f2-bcc3-edc60da23678.txn new file mode 100644 index 0000000000000000000000000000000000000000..2780f0d71c957266e9fe6660c3aaf8008e97d76c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/72-7d615d45-83fe-46f2-bcc3-edc60da23678.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/73-fd17081d-e537-4664-992d-18e1056f5daf.txn b/.lancedb/nltk_chunking.lance/_transactions/73-fd17081d-e537-4664-992d-18e1056f5daf.txn new file mode 100644 index 0000000000000000000000000000000000000000..52eff6d262783081345d0c905e8eaefa884888f3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/73-fd17081d-e537-4664-992d-18e1056f5daf.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/74-c4f202a1-079b-4e6c-886a-6960881ab93a.txn b/.lancedb/nltk_chunking.lance/_transactions/74-c4f202a1-079b-4e6c-886a-6960881ab93a.txn new file mode 100644 index 0000000000000000000000000000000000000000..1d784ec098bc1ac6e4e5b0f7677d26ec8d17e76e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/74-c4f202a1-079b-4e6c-886a-6960881ab93a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/75-f918234d-3de0-44c2-8396-9391a2877652.txn b/.lancedb/nltk_chunking.lance/_transactions/75-f918234d-3de0-44c2-8396-9391a2877652.txn new file mode 100644 index 0000000000000000000000000000000000000000..ba7ce8cbce22b0167419bd3b29cc7dd93b33ff3d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/75-f918234d-3de0-44c2-8396-9391a2877652.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/76-a982516f-c0f7-4b05-8214-906bc89f6b25.txn b/.lancedb/nltk_chunking.lance/_transactions/76-a982516f-c0f7-4b05-8214-906bc89f6b25.txn new file mode 100644 index 0000000000000000000000000000000000000000..ffc584f0b1ba1221410a840396af25d8c34e0f34 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/76-a982516f-c0f7-4b05-8214-906bc89f6b25.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/77-c302bc51-cd33-49f1-8062-5d2aa4a6454d.txn b/.lancedb/nltk_chunking.lance/_transactions/77-c302bc51-cd33-49f1-8062-5d2aa4a6454d.txn new file mode 100644 index 0000000000000000000000000000000000000000..80908b1f980150753aca9e2cbc4779b9a3cbabd8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/77-c302bc51-cd33-49f1-8062-5d2aa4a6454d.txn differ diff --git 
a/.lancedb/nltk_chunking.lance/_transactions/78-7a76a1c8-ee8a-4f8a-8e54-3c961b484e64.txn b/.lancedb/nltk_chunking.lance/_transactions/78-7a76a1c8-ee8a-4f8a-8e54-3c961b484e64.txn new file mode 100644 index 0000000000000000000000000000000000000000..015c757dc0aef52e9160a94bd2826365a9a9f68c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/78-7a76a1c8-ee8a-4f8a-8e54-3c961b484e64.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/79-18d988bd-dc9a-49fd-a27b-7c7e184de499.txn b/.lancedb/nltk_chunking.lance/_transactions/79-18d988bd-dc9a-49fd-a27b-7c7e184de499.txn new file mode 100644 index 0000000000000000000000000000000000000000..514d2b25ea6dc042c47483ad9197749720178e02 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/79-18d988bd-dc9a-49fd-a27b-7c7e184de499.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/8-7b02f933-c9b9-4240-9d6d-37af1ed0d3fc.txn b/.lancedb/nltk_chunking.lance/_transactions/8-7b02f933-c9b9-4240-9d6d-37af1ed0d3fc.txn new file mode 100644 index 0000000000000000000000000000000000000000..6b52ed9c2977f698226510206c0e265758d73fbb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/8-7b02f933-c9b9-4240-9d6d-37af1ed0d3fc.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/80-c730c277-5a82-4ff1-818d-98cdc51c6f93.txn b/.lancedb/nltk_chunking.lance/_transactions/80-c730c277-5a82-4ff1-818d-98cdc51c6f93.txn new file mode 100644 index 0000000000000000000000000000000000000000..abdc24b9e269f5a02f40e0cd998877c61a61a7ff Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/80-c730c277-5a82-4ff1-818d-98cdc51c6f93.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/81-1306f1cd-71d3-47ea-8e35-98d7c29f315a.txn b/.lancedb/nltk_chunking.lance/_transactions/81-1306f1cd-71d3-47ea-8e35-98d7c29f315a.txn new file mode 100644 index 0000000000000000000000000000000000000000..30a5dd9309b75a7beaa9083b8aefd9099d112bfa Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/81-1306f1cd-71d3-47ea-8e35-98d7c29f315a.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/82-04b0f29a-2d51-49dd-aa08-7c2be31c8838.txn b/.lancedb/nltk_chunking.lance/_transactions/82-04b0f29a-2d51-49dd-aa08-7c2be31c8838.txn new file mode 100644 index 0000000000000000000000000000000000000000..5b1eaf821b8d74c2bfc78418ff2d099605f65092 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/82-04b0f29a-2d51-49dd-aa08-7c2be31c8838.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/83-c01c195f-0ef9-4ddf-881e-034861c6310d.txn b/.lancedb/nltk_chunking.lance/_transactions/83-c01c195f-0ef9-4ddf-881e-034861c6310d.txn new file mode 100644 index 0000000000000000000000000000000000000000..0826a3e2dae5862a6b40333d5084ff311e743acb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/83-c01c195f-0ef9-4ddf-881e-034861c6310d.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/84-ac03ecb1-6957-4140-9e6a-5d90b5a199f6.txn b/.lancedb/nltk_chunking.lance/_transactions/84-ac03ecb1-6957-4140-9e6a-5d90b5a199f6.txn new file mode 100644 index 0000000000000000000000000000000000000000..d21032cae0816a3ea21216b0c1de5feef2845238 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/84-ac03ecb1-6957-4140-9e6a-5d90b5a199f6.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/85-4bb5e78b-1359-483f-bc11-3d748e43b0ad.txn b/.lancedb/nltk_chunking.lance/_transactions/85-4bb5e78b-1359-483f-bc11-3d748e43b0ad.txn new 
file mode 100644 index 0000000000000000000000000000000000000000..e09e49d64531e10f01e3a1927b5848bcfa35c706 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/85-4bb5e78b-1359-483f-bc11-3d748e43b0ad.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/86-8e9cd1f1-29fc-4143-b1ac-2608da7a9812.txn b/.lancedb/nltk_chunking.lance/_transactions/86-8e9cd1f1-29fc-4143-b1ac-2608da7a9812.txn new file mode 100644 index 0000000000000000000000000000000000000000..b55b5a7e1c30dfc74920ac2faf73faf81aef409e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/86-8e9cd1f1-29fc-4143-b1ac-2608da7a9812.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/87-5edc845c-efb6-43ec-89a5-73d2cf5bd502.txn b/.lancedb/nltk_chunking.lance/_transactions/87-5edc845c-efb6-43ec-89a5-73d2cf5bd502.txn new file mode 100644 index 0000000000000000000000000000000000000000..a20cf05a7a0dc936d113392cbef80ba648af5123 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/87-5edc845c-efb6-43ec-89a5-73d2cf5bd502.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/88-88aa0b8a-0295-4be6-bac0-ada1ef0c91b3.txn b/.lancedb/nltk_chunking.lance/_transactions/88-88aa0b8a-0295-4be6-bac0-ada1ef0c91b3.txn new file mode 100644 index 0000000000000000000000000000000000000000..64a2d3260886eb1a8a7cf61e2d701218e96ad7db Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/88-88aa0b8a-0295-4be6-bac0-ada1ef0c91b3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/89-d36a19e5-cd6a-4bb2-a269-7a085e505dd3.txn b/.lancedb/nltk_chunking.lance/_transactions/89-d36a19e5-cd6a-4bb2-a269-7a085e505dd3.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb1ecec51ef1353d64251d8d5cd77a81a9a9bb79 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/89-d36a19e5-cd6a-4bb2-a269-7a085e505dd3.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/9-1d64fd97-fb57-41f7-9ebf-866e7e5a6f2b.txn b/.lancedb/nltk_chunking.lance/_transactions/9-1d64fd97-fb57-41f7-9ebf-866e7e5a6f2b.txn new file mode 100644 index 0000000000000000000000000000000000000000..09e5157406a5a45f30ca055913d17962d58bb587 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/9-1d64fd97-fb57-41f7-9ebf-866e7e5a6f2b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/90-7c1918be-daa4-4e95-bc19-58823d39701b.txn b/.lancedb/nltk_chunking.lance/_transactions/90-7c1918be-daa4-4e95-bc19-58823d39701b.txn new file mode 100644 index 0000000000000000000000000000000000000000..c2ff012d1657892daa8c308ea9bf969861d618ad Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/90-7c1918be-daa4-4e95-bc19-58823d39701b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/91-432a159d-cb68-47b9-ba5f-454d70e74761.txn b/.lancedb/nltk_chunking.lance/_transactions/91-432a159d-cb68-47b9-ba5f-454d70e74761.txn new file mode 100644 index 0000000000000000000000000000000000000000..9ace4a2d041480ff0bdaa5bf33bd434a7039a1d6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/91-432a159d-cb68-47b9-ba5f-454d70e74761.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/92-fbddd305-f341-408d-b707-5a6d48d0075b.txn b/.lancedb/nltk_chunking.lance/_transactions/92-fbddd305-f341-408d-b707-5a6d48d0075b.txn new file mode 100644 index 0000000000000000000000000000000000000000..a1851448a33cb27e9d893e0f7aa3fc4255c53f46 Binary files /dev/null and 
b/.lancedb/nltk_chunking.lance/_transactions/92-fbddd305-f341-408d-b707-5a6d48d0075b.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/93-e5a263d0-f479-406d-9170-a65e9b568a6e.txn b/.lancedb/nltk_chunking.lance/_transactions/93-e5a263d0-f479-406d-9170-a65e9b568a6e.txn new file mode 100644 index 0000000000000000000000000000000000000000..e6cadff65a611386ba9b987a4e2ae74b0f8f8b4b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/93-e5a263d0-f479-406d-9170-a65e9b568a6e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/94-0f8caa38-788c-44b1-9e58-4e1f7f4cbd0e.txn b/.lancedb/nltk_chunking.lance/_transactions/94-0f8caa38-788c-44b1-9e58-4e1f7f4cbd0e.txn new file mode 100644 index 0000000000000000000000000000000000000000..592690de76e055a14de8331525a96fd3b5f65e4f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/94-0f8caa38-788c-44b1-9e58-4e1f7f4cbd0e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/95-c6e19c38-ce8f-4b8a-ae56-53cec08705f4.txn b/.lancedb/nltk_chunking.lance/_transactions/95-c6e19c38-ce8f-4b8a-ae56-53cec08705f4.txn new file mode 100644 index 0000000000000000000000000000000000000000..802cc10e91f569dd22d08277f961a5d87d90116f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/95-c6e19c38-ce8f-4b8a-ae56-53cec08705f4.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/96-c696de33-24b0-432d-9dce-cbba07e0d878.txn b/.lancedb/nltk_chunking.lance/_transactions/96-c696de33-24b0-432d-9dce-cbba07e0d878.txn new file mode 100644 index 0000000000000000000000000000000000000000..5fdd95ea5ebfb78963633b3826dc067364925a22 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/96-c696de33-24b0-432d-9dce-cbba07e0d878.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/97-876f322d-469f-435a-9f86-f9f34c52c5f1.txn b/.lancedb/nltk_chunking.lance/_transactions/97-876f322d-469f-435a-9f86-f9f34c52c5f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..3f452283ccbb59e521c982a5f9dacb88274a15de Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/97-876f322d-469f-435a-9f86-f9f34c52c5f1.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/98-2675d6b9-3fac-47a6-a698-2c93d306500e.txn b/.lancedb/nltk_chunking.lance/_transactions/98-2675d6b9-3fac-47a6-a698-2c93d306500e.txn new file mode 100644 index 0000000000000000000000000000000000000000..14c1b35836f2c2a37d1caabcdae608109d397fc4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/98-2675d6b9-3fac-47a6-a698-2c93d306500e.txn differ diff --git a/.lancedb/nltk_chunking.lance/_transactions/99-234b6346-f2d1-4fcb-8537-0af213769af7.txn b/.lancedb/nltk_chunking.lance/_transactions/99-234b6346-f2d1-4fcb-8537-0af213769af7.txn new file mode 100644 index 0000000000000000000000000000000000000000..c22cbee3941a283d08a8cdb5e2ba85cbd693747e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_transactions/99-234b6346-f2d1-4fcb-8537-0af213769af7.txn differ diff --git a/.lancedb/nltk_chunking.lance/_versions/1.manifest b/.lancedb/nltk_chunking.lance/_versions/1.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cd2060a167b7926e7f4eb6a862f3893d1eb12484 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/1.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/10.manifest b/.lancedb/nltk_chunking.lance/_versions/10.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..07c8b1cb8b5ed86800c18c983458dbf92cb492c1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/10.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/100.manifest b/.lancedb/nltk_chunking.lance/_versions/100.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f3916fa1ba63c65c20db969ed274c60eef037074 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/100.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/101.manifest b/.lancedb/nltk_chunking.lance/_versions/101.manifest new file mode 100644 index 0000000000000000000000000000000000000000..04fb69001d3fe5b17bee7091f029ad945b9a3c07 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/101.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/102.manifest b/.lancedb/nltk_chunking.lance/_versions/102.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f76aff1df41c0b7161ad06301154c4d558ef08c3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/102.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/103.manifest b/.lancedb/nltk_chunking.lance/_versions/103.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1ca6dcbe279d61cd92b578ef3453c50125141a66 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/103.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/104.manifest b/.lancedb/nltk_chunking.lance/_versions/104.manifest new file mode 100644 index 0000000000000000000000000000000000000000..40a8e46a6216523efde01f4147ad78847df1ed8e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/104.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/105.manifest b/.lancedb/nltk_chunking.lance/_versions/105.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b4e3d7fca07426284a861fec3c2a7b3d01fbcc2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/105.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/106.manifest b/.lancedb/nltk_chunking.lance/_versions/106.manifest new file mode 100644 index 0000000000000000000000000000000000000000..15f510aade1660603a83854d8afb93173d687927 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/106.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/107.manifest b/.lancedb/nltk_chunking.lance/_versions/107.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1c546557c75ccf82179cbc48f89d3b68eed9e81a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/107.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/108.manifest b/.lancedb/nltk_chunking.lance/_versions/108.manifest new file mode 100644 index 0000000000000000000000000000000000000000..abd2eb7eed9b615e61e9a9f8983d55f3e6abf34a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/108.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/109.manifest b/.lancedb/nltk_chunking.lance/_versions/109.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a695e120239e25eca5b10a87e336baa8b365bf7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/109.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/11.manifest b/.lancedb/nltk_chunking.lance/_versions/11.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..bf7cb13e4994271ed8bf55fc821e7d99912a2494 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/11.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/110.manifest b/.lancedb/nltk_chunking.lance/_versions/110.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8345587a2379c71b79030c50593e79c6c9b809fc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/110.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/111.manifest b/.lancedb/nltk_chunking.lance/_versions/111.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c687077b4d3fc82a93ab2be39802198b7b3009eb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/111.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/112.manifest b/.lancedb/nltk_chunking.lance/_versions/112.manifest new file mode 100644 index 0000000000000000000000000000000000000000..220f88ffe5f2fcc446f2c5354883f473a9489868 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/112.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/113.manifest b/.lancedb/nltk_chunking.lance/_versions/113.manifest new file mode 100644 index 0000000000000000000000000000000000000000..df90cfb71efdf012f2caf43733a4a8f940844e9f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/113.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/114.manifest b/.lancedb/nltk_chunking.lance/_versions/114.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d07a8ad5e0182a5d8925ad1cc2ce711ca3a69d2a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/114.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/115.manifest b/.lancedb/nltk_chunking.lance/_versions/115.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4dcd61e814f8afe22408a7b8d0c98e51720fbfc1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/115.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/116.manifest b/.lancedb/nltk_chunking.lance/_versions/116.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b58639675ddf20b67e7c3e1c8a71baa3e19556ba Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/116.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/117.manifest b/.lancedb/nltk_chunking.lance/_versions/117.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c0cf329f688d7a400cdab7314dfc0f0932f50524 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/117.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/118.manifest b/.lancedb/nltk_chunking.lance/_versions/118.manifest new file mode 100644 index 0000000000000000000000000000000000000000..05bcc822824d2938bda5cbbd290634868d285672 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/118.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/119.manifest b/.lancedb/nltk_chunking.lance/_versions/119.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd14e48603b339e8dbc122b7084d246eb9aecfc0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/119.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/12.manifest b/.lancedb/nltk_chunking.lance/_versions/12.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..a9fef035168637092dbaf5138f880c1c20b6097a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/12.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/120.manifest b/.lancedb/nltk_chunking.lance/_versions/120.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fc8af0b6d8457e2b053e10c5674fb1cecfc3a858 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/120.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/121.manifest b/.lancedb/nltk_chunking.lance/_versions/121.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fb07e8cecccca512b94be8f41661013c04bdb7b3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/121.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/122.manifest b/.lancedb/nltk_chunking.lance/_versions/122.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c8600c51bfa6c7398c615b0f694c316f4109b5bb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/122.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/123.manifest b/.lancedb/nltk_chunking.lance/_versions/123.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4af0cdeb88548895713be6117386c51b6c4d8bec Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/123.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/124.manifest b/.lancedb/nltk_chunking.lance/_versions/124.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4c65cf28fa346f582ef0e6dcfc93809f148a1a5b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/124.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/125.manifest b/.lancedb/nltk_chunking.lance/_versions/125.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a96d499307e740f70ec07c8e76dfcddd25186d4e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/125.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/126.manifest b/.lancedb/nltk_chunking.lance/_versions/126.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b022d62d5f374f313a1dda9e117873a2c4917632 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/126.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/127.manifest b/.lancedb/nltk_chunking.lance/_versions/127.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b728c3be1d51e2858242a17f5a057e93f5d3ac08 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/127.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/128.manifest b/.lancedb/nltk_chunking.lance/_versions/128.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f07213a4af0bf7cbfa9035b0c3c6ba154bd4dbb3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/128.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/129.manifest b/.lancedb/nltk_chunking.lance/_versions/129.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9d9a6c02f10868b065bfce3b35136c659ca051ed Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/129.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/13.manifest b/.lancedb/nltk_chunking.lance/_versions/13.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..7e70aa1b614e52ff1d0c3ad97ed413e214629010 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/13.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/130.manifest b/.lancedb/nltk_chunking.lance/_versions/130.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ff02aa78b916b4eada1ac125ed17397d82ae8fe3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/130.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/131.manifest b/.lancedb/nltk_chunking.lance/_versions/131.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7a654986815a52534b83078b268a8b6d233616b3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/131.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/132.manifest b/.lancedb/nltk_chunking.lance/_versions/132.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0bb6b40a95b105876ad00e9773572d20069ef06b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/132.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/133.manifest b/.lancedb/nltk_chunking.lance/_versions/133.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cf580deacd4cd4455fc7a06fcf75116dd71e3527 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/133.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/134.manifest b/.lancedb/nltk_chunking.lance/_versions/134.manifest new file mode 100644 index 0000000000000000000000000000000000000000..42c1486cdd530386a003dd2601f4ceca0aaa0366 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/134.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/135.manifest b/.lancedb/nltk_chunking.lance/_versions/135.manifest new file mode 100644 index 0000000000000000000000000000000000000000..49f35f4db9af49ad9db507b0fbbf2198879865dd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/135.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/136.manifest b/.lancedb/nltk_chunking.lance/_versions/136.manifest new file mode 100644 index 0000000000000000000000000000000000000000..09d016b7433444e881f389e91d0999ead8355a8b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/136.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/137.manifest b/.lancedb/nltk_chunking.lance/_versions/137.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0b7ffbf882f0f0597de5fe5d7fb668a420790952 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/137.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/138.manifest b/.lancedb/nltk_chunking.lance/_versions/138.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a0a0cd9210fb39833b84bc51603ad72dc0e233f9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/138.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/139.manifest b/.lancedb/nltk_chunking.lance/_versions/139.manifest new file mode 100644 index 0000000000000000000000000000000000000000..76dfbc4d28f7f25240ab9c21877ad57181a365d6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/139.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/14.manifest b/.lancedb/nltk_chunking.lance/_versions/14.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..d788e6e95e58c3b45e42b84ebf6d5812de6176ea Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/14.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/140.manifest b/.lancedb/nltk_chunking.lance/_versions/140.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb7ed5f648a7a61292656d26eafc713ad6227897 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/140.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/141.manifest b/.lancedb/nltk_chunking.lance/_versions/141.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fad9f8a0060c28782ffaa09a60d8289c89e53189 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/141.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/142.manifest b/.lancedb/nltk_chunking.lance/_versions/142.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b989b72346083dd624f48749b8949d8abf528789 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/142.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/143.manifest b/.lancedb/nltk_chunking.lance/_versions/143.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cfc36c7600377e1ab148586ec484bad157bac1dd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/143.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/144.manifest b/.lancedb/nltk_chunking.lance/_versions/144.manifest new file mode 100644 index 0000000000000000000000000000000000000000..419291fe0b303a43d52dcf068bfaaf9b94fa1e3f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/144.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/145.manifest b/.lancedb/nltk_chunking.lance/_versions/145.manifest new file mode 100644 index 0000000000000000000000000000000000000000..28bcd020c75528f711ea6bc4a42982b782612917 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/145.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/146.manifest b/.lancedb/nltk_chunking.lance/_versions/146.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7dc7fb3eafe8e109244705c483024dc6db575ea6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/146.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/147.manifest b/.lancedb/nltk_chunking.lance/_versions/147.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4a1b4cec325f1130dbbeb45a69acff99872a2d2d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/147.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/148.manifest b/.lancedb/nltk_chunking.lance/_versions/148.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b1a5d3b4e7666b30f6db0aa735b7a040322e1a8c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/148.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/149.manifest b/.lancedb/nltk_chunking.lance/_versions/149.manifest new file mode 100644 index 0000000000000000000000000000000000000000..888f09468bd1c3671c23c2b13f14bbedef89a73f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/149.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/15.manifest b/.lancedb/nltk_chunking.lance/_versions/15.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..77be49482fd2a95f077f268dc61f67451fdf7a2e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/15.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/150.manifest b/.lancedb/nltk_chunking.lance/_versions/150.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c397f95728861b6e393bc44e5a40dc2fef1254ee Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/150.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/151.manifest b/.lancedb/nltk_chunking.lance/_versions/151.manifest new file mode 100644 index 0000000000000000000000000000000000000000..12ba4b59405ad8465f8d362c352d9df605aad9de Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/151.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/152.manifest b/.lancedb/nltk_chunking.lance/_versions/152.manifest new file mode 100644 index 0000000000000000000000000000000000000000..06d8f4c3c0fb1d62fa1141062c56ff6fdb00279f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/152.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/153.manifest b/.lancedb/nltk_chunking.lance/_versions/153.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3d8990034c5cfbc84cee233f89a600d576ac5719 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/153.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/154.manifest b/.lancedb/nltk_chunking.lance/_versions/154.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cb01168d572292b7321e60a5b2e4f2ef5f855833 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/154.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/155.manifest b/.lancedb/nltk_chunking.lance/_versions/155.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ad41c3aa73a89da6df935a4f46c2adf01bcfb6b1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/155.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/156.manifest b/.lancedb/nltk_chunking.lance/_versions/156.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1fd03b036b0f29df8c422773b2b1853506103e69 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/156.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/157.manifest b/.lancedb/nltk_chunking.lance/_versions/157.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ffc2332b8dce91081dc732cc112b9255d6130eac Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/157.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/158.manifest b/.lancedb/nltk_chunking.lance/_versions/158.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f2eda5a790f9455bf33b17d04a643025d1511ce2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/158.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/159.manifest b/.lancedb/nltk_chunking.lance/_versions/159.manifest new file mode 100644 index 0000000000000000000000000000000000000000..640889190293ab40d04e10c567b259d689a65870 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/159.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/16.manifest b/.lancedb/nltk_chunking.lance/_versions/16.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..2dae9ed83487982fe8a23c786ecbd152517ec92c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/16.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/160.manifest b/.lancedb/nltk_chunking.lance/_versions/160.manifest new file mode 100644 index 0000000000000000000000000000000000000000..871aec5571efa451139ad9e98255ceb89d53671c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/160.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/161.manifest b/.lancedb/nltk_chunking.lance/_versions/161.manifest new file mode 100644 index 0000000000000000000000000000000000000000..99fd0a49f583ddf7a3799b4185d13f23cf1e27fd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/161.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/162.manifest b/.lancedb/nltk_chunking.lance/_versions/162.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e722f798daa0c936ec765b3e96c240e27736cc82 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/162.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/163.manifest b/.lancedb/nltk_chunking.lance/_versions/163.manifest new file mode 100644 index 0000000000000000000000000000000000000000..47be6ba69e359546173683673e105d4f8944bb4a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/163.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/164.manifest b/.lancedb/nltk_chunking.lance/_versions/164.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a375c34ff6bd5e3e13af16f3914f5b451e114045 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/164.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/165.manifest b/.lancedb/nltk_chunking.lance/_versions/165.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9bce73317c635c29ab5737a7a5db0b45bfa19379 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/165.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/166.manifest b/.lancedb/nltk_chunking.lance/_versions/166.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1352fd33cf31a166cc575f9d11ce5dc12e926ef8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/166.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/167.manifest b/.lancedb/nltk_chunking.lance/_versions/167.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a293e62504eb4afe7ff951523fae7e97f3e9216 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/167.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/168.manifest b/.lancedb/nltk_chunking.lance/_versions/168.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8cb9a86c4ee0a6e24de0b227b9924495e63fce25 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/168.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/169.manifest b/.lancedb/nltk_chunking.lance/_versions/169.manifest new file mode 100644 index 0000000000000000000000000000000000000000..796faeea020b09c0b560be025e0b0bfa9ced6929 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/169.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/17.manifest b/.lancedb/nltk_chunking.lance/_versions/17.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..c48a31a12af1f48753f28c3073c595e52074f9f7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/17.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/170.manifest b/.lancedb/nltk_chunking.lance/_versions/170.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e2e3151d1735e826618f81b704e69e06a3ee8767 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/170.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/171.manifest b/.lancedb/nltk_chunking.lance/_versions/171.manifest new file mode 100644 index 0000000000000000000000000000000000000000..489cccd44a95c1c4b0b22829baa76f88a45615c5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/171.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/172.manifest b/.lancedb/nltk_chunking.lance/_versions/172.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3edeeb1c0de82b292b10b0009943d29d5669ddb6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/172.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/173.manifest b/.lancedb/nltk_chunking.lance/_versions/173.manifest new file mode 100644 index 0000000000000000000000000000000000000000..394885114bc497b98ad685d1ecd6dadea5aa19dc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/173.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/174.manifest b/.lancedb/nltk_chunking.lance/_versions/174.manifest new file mode 100644 index 0000000000000000000000000000000000000000..afce1ce9eb9e037af471520b168f25de0f0d1ba7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/174.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/175.manifest b/.lancedb/nltk_chunking.lance/_versions/175.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8d72c1194294944a4511ccbb95b17dd18d186b45 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/175.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/176.manifest b/.lancedb/nltk_chunking.lance/_versions/176.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6269595e284425a0dc88318a422a72664fc20842 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/176.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/177.manifest b/.lancedb/nltk_chunking.lance/_versions/177.manifest new file mode 100644 index 0000000000000000000000000000000000000000..21d2f7b803434bbe66cc983d60079fe7fbe18e9a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/177.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/178.manifest b/.lancedb/nltk_chunking.lance/_versions/178.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e4e8acf345299be437ed7ae8c718d2927fea1912 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/178.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/179.manifest b/.lancedb/nltk_chunking.lance/_versions/179.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d07d1c7058acd58f5392c1895e89c9242f6a83d2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/179.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/18.manifest b/.lancedb/nltk_chunking.lance/_versions/18.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..fd61103bd12b5f03aad5c187943995844f06d34d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/18.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/180.manifest b/.lancedb/nltk_chunking.lance/_versions/180.manifest new file mode 100644 index 0000000000000000000000000000000000000000..197c0159a7adadf383078f097250277aa144a7fb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/180.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/181.manifest b/.lancedb/nltk_chunking.lance/_versions/181.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5649907a2a1302c021d63b15c577c07cfd970cec Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/181.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/182.manifest b/.lancedb/nltk_chunking.lance/_versions/182.manifest new file mode 100644 index 0000000000000000000000000000000000000000..73d13c113ed11849aefd340edbfa1e4425de8f06 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/182.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/183.manifest b/.lancedb/nltk_chunking.lance/_versions/183.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5ac7728470149627f827678fc542ad1aa3a577f9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/183.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/184.manifest b/.lancedb/nltk_chunking.lance/_versions/184.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b5926e6ac0107229e29c9132ff2cf25f301488e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/184.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/185.manifest b/.lancedb/nltk_chunking.lance/_versions/185.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5e12cce259a0c7263f984970d4e8e0b2bc2f6155 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/185.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/186.manifest b/.lancedb/nltk_chunking.lance/_versions/186.manifest new file mode 100644 index 0000000000000000000000000000000000000000..82f5f684c0dd8350e6ad7c7201ede4e5254a5d22 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/186.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/187.manifest b/.lancedb/nltk_chunking.lance/_versions/187.manifest new file mode 100644 index 0000000000000000000000000000000000000000..95a884045b279ef54764b0ecb2fec274e74fc5ae Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/187.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/188.manifest b/.lancedb/nltk_chunking.lance/_versions/188.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2140b6e57173e83114a017a14cd00a5f51f35720 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/188.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/189.manifest b/.lancedb/nltk_chunking.lance/_versions/189.manifest new file mode 100644 index 0000000000000000000000000000000000000000..54d745a5415ce7d90c25e94a7677f52940f55158 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/189.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/19.manifest b/.lancedb/nltk_chunking.lance/_versions/19.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..1e1be226d17c7a2d27e9c0a99a8c48995a48eb4e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/19.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/190.manifest b/.lancedb/nltk_chunking.lance/_versions/190.manifest new file mode 100644 index 0000000000000000000000000000000000000000..85972381e9cd0d6bdfdc3517843878b69865ee98 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/190.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/191.manifest b/.lancedb/nltk_chunking.lance/_versions/191.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e0624714a8978ed771042509693ffd86cbd8b8f5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/191.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/192.manifest b/.lancedb/nltk_chunking.lance/_versions/192.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4538ce35142602126749b9f49e8e524af6e9e178 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/192.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/193.manifest b/.lancedb/nltk_chunking.lance/_versions/193.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d679d5f9fa129624dfa49b1c3d59de369617ddbb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/193.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/194.manifest b/.lancedb/nltk_chunking.lance/_versions/194.manifest new file mode 100644 index 0000000000000000000000000000000000000000..feea820302f4a740ee77eccf1d6b7c44422132c6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/194.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/195.manifest b/.lancedb/nltk_chunking.lance/_versions/195.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0fa80b6373c82dea93b0e183be687af0280e6e36 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/195.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/196.manifest b/.lancedb/nltk_chunking.lance/_versions/196.manifest new file mode 100644 index 0000000000000000000000000000000000000000..95ed43ebda08ff91910686b24b023fb886c1d12e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/196.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/197.manifest b/.lancedb/nltk_chunking.lance/_versions/197.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5e864a82b7e99935b8b5b021fb767cb47f2c4488 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/197.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/198.manifest b/.lancedb/nltk_chunking.lance/_versions/198.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ba1c499c532d7b6be5d9a7504dc760de95c2315 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/198.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/199.manifest b/.lancedb/nltk_chunking.lance/_versions/199.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0613c19a89f0d891f7452a37d83164e0b913f8f1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/199.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/2.manifest b/.lancedb/nltk_chunking.lance/_versions/2.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4f40cd788e1b105981815f831fd4912e54e3a269 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/2.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/20.manifest b/.lancedb/nltk_chunking.lance/_versions/20.manifest new file mode 100644 index 0000000000000000000000000000000000000000..98290235b9e3b16e952340f5b21e8361b5b85e62 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/20.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/200.manifest b/.lancedb/nltk_chunking.lance/_versions/200.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e419a8b9c14fffd167bec6c8ed626bf85c69c7a9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/200.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/201.manifest b/.lancedb/nltk_chunking.lance/_versions/201.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d6c990e63c60544c9bacc4104d4f4855c6b3a77e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/201.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/202.manifest b/.lancedb/nltk_chunking.lance/_versions/202.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b6ea27726475764efe63a66a8647beea7d7db81f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/202.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/203.manifest b/.lancedb/nltk_chunking.lance/_versions/203.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d9e4572aec441ddc1544aa6b2955f378c6510eb4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/203.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/204.manifest b/.lancedb/nltk_chunking.lance/_versions/204.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1aa66f2cb7c02af4604637f4702c55351c9043d2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/204.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/205.manifest b/.lancedb/nltk_chunking.lance/_versions/205.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3a2698827a5474871a25b9c2972c4201bf1bb0be Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/205.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/206.manifest b/.lancedb/nltk_chunking.lance/_versions/206.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8a42235e5dce2b7c86fad3dd73b3c5d1b8061608 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/206.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/207.manifest b/.lancedb/nltk_chunking.lance/_versions/207.manifest new file mode 100644 index 0000000000000000000000000000000000000000..913d6cf52c7d9c5b3d2f77a0fa5194d4f5f18672 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/207.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/208.manifest b/.lancedb/nltk_chunking.lance/_versions/208.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a56e7d9e416dce721859092ac6aaec05fadbc35a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/208.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/209.manifest b/.lancedb/nltk_chunking.lance/_versions/209.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..436f39ff003a6b953f7171f743d22bf22e951cc6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/209.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/21.manifest b/.lancedb/nltk_chunking.lance/_versions/21.manifest new file mode 100644 index 0000000000000000000000000000000000000000..89a0380132a5f12f39904a5199109e9560a9b741 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/21.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/210.manifest b/.lancedb/nltk_chunking.lance/_versions/210.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a93db8c94a1ea1c7ee1325cdbb6f58bc5f0d83c0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/210.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/211.manifest b/.lancedb/nltk_chunking.lance/_versions/211.manifest new file mode 100644 index 0000000000000000000000000000000000000000..58e0d4f54d93cb2e3c00d0ccecbc64deac8b883c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/211.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/212.manifest b/.lancedb/nltk_chunking.lance/_versions/212.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6f9d75131c4bd103ac8a8402af303f2741bd5638 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/212.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/213.manifest b/.lancedb/nltk_chunking.lance/_versions/213.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5a8bdb5f7960be3f1ce8a93d5cb40d801fe1f759 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/213.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/214.manifest b/.lancedb/nltk_chunking.lance/_versions/214.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d1cee514d9704e32a5507237e212905d63434650 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/214.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/215.manifest b/.lancedb/nltk_chunking.lance/_versions/215.manifest new file mode 100644 index 0000000000000000000000000000000000000000..362fa3c9d8be508c785e37ee8af05d2fc75bfc60 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/215.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/216.manifest b/.lancedb/nltk_chunking.lance/_versions/216.manifest new file mode 100644 index 0000000000000000000000000000000000000000..00bff5e7602a8dc441db92ac797c2632b315411e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/216.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/217.manifest b/.lancedb/nltk_chunking.lance/_versions/217.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e555ca4be606f11006a26a97fd905316f2944248 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/217.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/218.manifest b/.lancedb/nltk_chunking.lance/_versions/218.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a6e7b1bcac5ee235553b22a33298d14827b4992d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/218.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/219.manifest b/.lancedb/nltk_chunking.lance/_versions/219.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..5f269629d904b5b364e428740a4031805aa2594a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/219.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/22.manifest b/.lancedb/nltk_chunking.lance/_versions/22.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ea490f72df37f74096254f654c9c5acc71739294 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/22.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/220.manifest b/.lancedb/nltk_chunking.lance/_versions/220.manifest new file mode 100644 index 0000000000000000000000000000000000000000..af9fa88dbe6f4e34448a653ba108d7d2b121414c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/220.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/221.manifest b/.lancedb/nltk_chunking.lance/_versions/221.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a3288b4ce3e477c6b94f8a80a7a71e0c39012278 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/221.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/222.manifest b/.lancedb/nltk_chunking.lance/_versions/222.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d7b866553f2106fba5a284428d96073b11402be8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/222.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/223.manifest b/.lancedb/nltk_chunking.lance/_versions/223.manifest new file mode 100644 index 0000000000000000000000000000000000000000..15d1b3f0b5a48301d347dcfed0acd2e5ba6de922 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/223.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/224.manifest b/.lancedb/nltk_chunking.lance/_versions/224.manifest new file mode 100644 index 0000000000000000000000000000000000000000..79f3a7cc0bcca1de50ab14b40ef1d833c18aa638 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/224.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/225.manifest b/.lancedb/nltk_chunking.lance/_versions/225.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7a2b44bd1224e65fe9c68add9f36170e0b050a53 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/225.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/226.manifest b/.lancedb/nltk_chunking.lance/_versions/226.manifest new file mode 100644 index 0000000000000000000000000000000000000000..12699d7c4d67da9a896731f68129d26dd634cf8f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/226.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/227.manifest b/.lancedb/nltk_chunking.lance/_versions/227.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60e9afe6b9b4ed235881688ca9fe0a96a6ebedef Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/227.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/228.manifest b/.lancedb/nltk_chunking.lance/_versions/228.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b6932b39cda61d71de418d220da0824369a6d2b4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/228.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/229.manifest b/.lancedb/nltk_chunking.lance/_versions/229.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..a778f626b244ff5190852b524b17fd8151abec75 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/229.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/23.manifest b/.lancedb/nltk_chunking.lance/_versions/23.manifest new file mode 100644 index 0000000000000000000000000000000000000000..157a8a1938cab20598f9da46f974bcceb9004315 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/23.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/230.manifest b/.lancedb/nltk_chunking.lance/_versions/230.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1c9043454aa5e6baf13ccfd250528c9e01c92c2e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/230.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/231.manifest b/.lancedb/nltk_chunking.lance/_versions/231.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e9f315dc85c9835ea1e6dd4c9b2ed593ac5c4386 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/231.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/232.manifest b/.lancedb/nltk_chunking.lance/_versions/232.manifest new file mode 100644 index 0000000000000000000000000000000000000000..64b8322255b7a1eaa8b3fa3370d8e600543cb641 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/232.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/233.manifest b/.lancedb/nltk_chunking.lance/_versions/233.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ab59fa3a8a080e06c895e51eaf8608571b6a1d6a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/233.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/234.manifest b/.lancedb/nltk_chunking.lance/_versions/234.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9b5e86663716a7fb9adbb8481db08f914f469d94 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/234.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/235.manifest b/.lancedb/nltk_chunking.lance/_versions/235.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0bb6c5a8fad201a7915e904276e354fa4ef40b28 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/235.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/236.manifest b/.lancedb/nltk_chunking.lance/_versions/236.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fb700001059cb760fe1f1a28046a9f851855b82e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/236.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/237.manifest b/.lancedb/nltk_chunking.lance/_versions/237.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ced410f47cd874c81d6f69780c87ddd1820ca356 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/237.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/238.manifest b/.lancedb/nltk_chunking.lance/_versions/238.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f62dd6e2760df5c7f5d51abeec79b742e83d1eb5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/238.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/239.manifest b/.lancedb/nltk_chunking.lance/_versions/239.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4b311eb813fbc1879b35e452f1674e2d63364f11 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/239.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/24.manifest b/.lancedb/nltk_chunking.lance/_versions/24.manifest new file mode 100644 index 0000000000000000000000000000000000000000..827db47f003e23401dc88adc62f2bfff3e475238 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/24.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/240.manifest b/.lancedb/nltk_chunking.lance/_versions/240.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2a2da8ef73f57f1f461502dc840482fc85ddb2c7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/240.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/241.manifest b/.lancedb/nltk_chunking.lance/_versions/241.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9d708baf73e7f531cb5b2db96c84e0f047e0c88a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/241.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/242.manifest b/.lancedb/nltk_chunking.lance/_versions/242.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b4f7c210a4835fc216fdc4d44e4db343e8e6233b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/242.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/243.manifest b/.lancedb/nltk_chunking.lance/_versions/243.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a6e90f0a9bd4f51eb4973c04293ffdce7715d9f5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/243.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/244.manifest b/.lancedb/nltk_chunking.lance/_versions/244.manifest new file mode 100644 index 0000000000000000000000000000000000000000..09d27e165402ec2b62128068ae460f56cd083495 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/244.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/245.manifest b/.lancedb/nltk_chunking.lance/_versions/245.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5a6638128b5b0dc159a11a00a3eb2eafe677e666 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/245.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/246.manifest b/.lancedb/nltk_chunking.lance/_versions/246.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d591db9faaada41bd247db7d45668b3b9040e957 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/246.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/247.manifest b/.lancedb/nltk_chunking.lance/_versions/247.manifest new file mode 100644 index 0000000000000000000000000000000000000000..740bcadf9c00b82b41805a2a3fca30a344f543a2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/247.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/248.manifest b/.lancedb/nltk_chunking.lance/_versions/248.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e53fadef759a2907e30326bcfeda7b9e19d5c844 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/248.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/249.manifest b/.lancedb/nltk_chunking.lance/_versions/249.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..b2b39adddd6bd0afaa2ee81293c26c71be049833 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/249.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/25.manifest b/.lancedb/nltk_chunking.lance/_versions/25.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fa70a13c6609f7290b5ae4976e5c91e92276512c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/25.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/250.manifest b/.lancedb/nltk_chunking.lance/_versions/250.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d978583e7023b7a3ffd9c705fdc0374be7195c33 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/250.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/251.manifest b/.lancedb/nltk_chunking.lance/_versions/251.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bee9955fcdbe15ef6cea22ced771d6580caa47b5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/251.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/252.manifest b/.lancedb/nltk_chunking.lance/_versions/252.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8fff24aa192a4454943f412d692135a3a8852df0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/252.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/253.manifest b/.lancedb/nltk_chunking.lance/_versions/253.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b5a4ebec2c3d78d4ccef56c5412c5f03e5141972 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/253.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/254.manifest b/.lancedb/nltk_chunking.lance/_versions/254.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bd205d93b8c701e2125252ec62d117202bf44b23 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/254.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/255.manifest b/.lancedb/nltk_chunking.lance/_versions/255.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d2d3599876345894789ecda9631522d7f04caf85 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/255.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/256.manifest b/.lancedb/nltk_chunking.lance/_versions/256.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dfa63b8e30121517005b7d14dbdd15f122e08217 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/256.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/257.manifest b/.lancedb/nltk_chunking.lance/_versions/257.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e432122ff8f85936cb5aba0f87b5b8cfc48e150a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/257.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/258.manifest b/.lancedb/nltk_chunking.lance/_versions/258.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ca50a1e3c5c922b795d6275c04e87c094a60c992 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/258.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/259.manifest b/.lancedb/nltk_chunking.lance/_versions/259.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..21c1727dde9673143dc0ef1cfdaad260b4510ef9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/259.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/26.manifest b/.lancedb/nltk_chunking.lance/_versions/26.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6dc251d0bd1bd815d6a0c7b45aa3a1f409b2e554 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/26.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/260.manifest b/.lancedb/nltk_chunking.lance/_versions/260.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4fa4d29168f30ce7f45e32810b0ee59de0efbb0a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/260.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/261.manifest b/.lancedb/nltk_chunking.lance/_versions/261.manifest new file mode 100644 index 0000000000000000000000000000000000000000..225f87dc735622c56e2e083d5ebf39c83752adef Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/261.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/262.manifest b/.lancedb/nltk_chunking.lance/_versions/262.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e7760675f0b37dab6ccdd2070f2f4396a5747933 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/262.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/263.manifest b/.lancedb/nltk_chunking.lance/_versions/263.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5f78e782d0355ff2bf4a59fe550dc49c394d396e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/263.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/264.manifest b/.lancedb/nltk_chunking.lance/_versions/264.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b74b89af23315b344d7e2e73c736ad39ccebb20e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/264.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/265.manifest b/.lancedb/nltk_chunking.lance/_versions/265.manifest new file mode 100644 index 0000000000000000000000000000000000000000..946fdeb8d05438bb5aeffd2a955082f35eaebb72 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/265.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/266.manifest b/.lancedb/nltk_chunking.lance/_versions/266.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8b3b429a9cd46ee293879513431ab3cb913d05fe Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/266.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/267.manifest b/.lancedb/nltk_chunking.lance/_versions/267.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3fed615ff0377985c41d18f85b817e4e088a0345 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/267.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/268.manifest b/.lancedb/nltk_chunking.lance/_versions/268.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c7e5502ad12f5ab836578023852c66e9a71dfb00 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/268.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/269.manifest b/.lancedb/nltk_chunking.lance/_versions/269.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..d69228da7c3352031e1255f4f71a9a135780a9b0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/269.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/27.manifest b/.lancedb/nltk_chunking.lance/_versions/27.manifest new file mode 100644 index 0000000000000000000000000000000000000000..55485618f30808c8cc386ce8e84e6763f59b1ac3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/27.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/270.manifest b/.lancedb/nltk_chunking.lance/_versions/270.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b660488f13c5d2ef722c40d8f699b9c7b2c64c01 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/270.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/271.manifest b/.lancedb/nltk_chunking.lance/_versions/271.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7c57b8733dda753640a5a2d41a7b8ec22a5d365d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/271.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/272.manifest b/.lancedb/nltk_chunking.lance/_versions/272.manifest new file mode 100644 index 0000000000000000000000000000000000000000..698e875a53f7e7d764bc9bdb5c8ee8eb3e8891d0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/272.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/273.manifest b/.lancedb/nltk_chunking.lance/_versions/273.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a25a51832a5a067045cd633cdd0e04a556a0f2c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/273.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/274.manifest b/.lancedb/nltk_chunking.lance/_versions/274.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dab8be78a719ae8890a446d8c82f9efba7b6b945 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/274.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/275.manifest b/.lancedb/nltk_chunking.lance/_versions/275.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b8d811de7af4dcb575eae071dd7030d225f0d4d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/275.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/276.manifest b/.lancedb/nltk_chunking.lance/_versions/276.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ac4d1083908ac2d6ded5b976baa8bd5062d9e6d9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/276.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/277.manifest b/.lancedb/nltk_chunking.lance/_versions/277.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a46217bc4438ec6226837ecd6fe073fc7857dd0f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/277.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/278.manifest b/.lancedb/nltk_chunking.lance/_versions/278.manifest new file mode 100644 index 0000000000000000000000000000000000000000..274207ca40c385f7ef842b4661a5575c601f1d50 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/278.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/279.manifest b/.lancedb/nltk_chunking.lance/_versions/279.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..c97c8eed7382b4c97b2da3b2cb59656775515f15 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/279.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/28.manifest b/.lancedb/nltk_chunking.lance/_versions/28.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d8162a60b5dfb9063f5083bce0cf90a7dcbaaa93 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/28.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/280.manifest b/.lancedb/nltk_chunking.lance/_versions/280.manifest new file mode 100644 index 0000000000000000000000000000000000000000..68c4583c78f825da85cf62dbbd18dea9ff2bf558 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/280.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/281.manifest b/.lancedb/nltk_chunking.lance/_versions/281.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a82591305711d227fd61de147f116ccf2c869584 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/281.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/282.manifest b/.lancedb/nltk_chunking.lance/_versions/282.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4d9f18039a4bdb3430eadae862c09d466f2f3349 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/282.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/283.manifest b/.lancedb/nltk_chunking.lance/_versions/283.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fdec1f0e99bca1aaca26be98a7003c02a102a8b4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/283.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/284.manifest b/.lancedb/nltk_chunking.lance/_versions/284.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9206ceba111a56674611e5e479ab2e3eef88b207 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/284.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/285.manifest b/.lancedb/nltk_chunking.lance/_versions/285.manifest new file mode 100644 index 0000000000000000000000000000000000000000..db4019ed337821baf3bb48dcf4a0fd28cc687b5a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/285.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/286.manifest b/.lancedb/nltk_chunking.lance/_versions/286.manifest new file mode 100644 index 0000000000000000000000000000000000000000..257b0d2971059b9a390cff8ff4f5102279db7259 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/286.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/287.manifest b/.lancedb/nltk_chunking.lance/_versions/287.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6d07ba6f3cbd66c0beea87147aeb323560ace05a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/287.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/288.manifest b/.lancedb/nltk_chunking.lance/_versions/288.manifest new file mode 100644 index 0000000000000000000000000000000000000000..14fc20eb98dd544e92b6b4ed1bfbc932bf3779af Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/288.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/289.manifest b/.lancedb/nltk_chunking.lance/_versions/289.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..24a87cca19977528638400ffd3218c80787df39f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/289.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/29.manifest b/.lancedb/nltk_chunking.lance/_versions/29.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8f623830844920d93f15661da4ae62cb7aad63a5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/29.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/290.manifest b/.lancedb/nltk_chunking.lance/_versions/290.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8433bf7711c76b522556300c00836e7f466633cb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/290.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/291.manifest b/.lancedb/nltk_chunking.lance/_versions/291.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2e86e1afb35e0e272183127f3c399c135ca39413 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/291.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/292.manifest b/.lancedb/nltk_chunking.lance/_versions/292.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d8f930d706a14945c331375240ec5be5031f2b3d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/292.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/293.manifest b/.lancedb/nltk_chunking.lance/_versions/293.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1ffbc12ec29fdbeafdbc2fff987d6a510b0d7f9a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/293.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/294.manifest b/.lancedb/nltk_chunking.lance/_versions/294.manifest new file mode 100644 index 0000000000000000000000000000000000000000..96dd49ac08f335b1ebabdba0f5f1eb847ad1529c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/294.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/295.manifest b/.lancedb/nltk_chunking.lance/_versions/295.manifest new file mode 100644 index 0000000000000000000000000000000000000000..217f226f0ee1abec340d2ce2929cbd679de2534a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/295.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/296.manifest b/.lancedb/nltk_chunking.lance/_versions/296.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cc755ac5517e5f4abb624c98dcdec313161e2d38 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/296.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/297.manifest b/.lancedb/nltk_chunking.lance/_versions/297.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d54402eafb0a4e88b2d4dc227a01849fb41f48a8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/297.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/298.manifest b/.lancedb/nltk_chunking.lance/_versions/298.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02e9d78a875ab2d954233f44214b89ce95931621 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/298.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/299.manifest b/.lancedb/nltk_chunking.lance/_versions/299.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4b4c10c54ff0c86704cae680b910c292c360ebac Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/299.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/3.manifest b/.lancedb/nltk_chunking.lance/_versions/3.manifest new file mode 100644 index 0000000000000000000000000000000000000000..879550c3fb353a7140fc947457306e6b95a82818 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/3.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/30.manifest b/.lancedb/nltk_chunking.lance/_versions/30.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e8ac67221e54badb66db2203832f7d58710584cf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/30.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/300.manifest b/.lancedb/nltk_chunking.lance/_versions/300.manifest new file mode 100644 index 0000000000000000000000000000000000000000..734c661707d0ebeaec0675e0deedeb9596f8e5e9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/300.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/301.manifest b/.lancedb/nltk_chunking.lance/_versions/301.manifest new file mode 100644 index 0000000000000000000000000000000000000000..848380274b744ae9e55262e61db52546d061b86c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/301.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/302.manifest b/.lancedb/nltk_chunking.lance/_versions/302.manifest new file mode 100644 index 0000000000000000000000000000000000000000..24baaf4e36ab13db6bf4bc3f5833ba95665525f0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/302.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/303.manifest b/.lancedb/nltk_chunking.lance/_versions/303.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bdda31bdc32101f8b62de1067811b79d9b43f8df Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/303.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/304.manifest b/.lancedb/nltk_chunking.lance/_versions/304.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f5194d4f928585a92761bf60fad0c0a3fcfbcb38 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/304.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/305.manifest b/.lancedb/nltk_chunking.lance/_versions/305.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3e79631ff3d62954a97a0df6faff7755e6accb43 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/305.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/306.manifest b/.lancedb/nltk_chunking.lance/_versions/306.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0ccdd65f3f43c7374824f0c9d2f9c27430db682b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/306.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/307.manifest b/.lancedb/nltk_chunking.lance/_versions/307.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f24d2465175ca91f2d66d4723f187cfeeed3a536 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/307.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/308.manifest b/.lancedb/nltk_chunking.lance/_versions/308.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4feaf55bb4c5ec4c678f77c12cfd64a3a5d33934 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/308.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/309.manifest b/.lancedb/nltk_chunking.lance/_versions/309.manifest new file mode 100644 index 0000000000000000000000000000000000000000..281d43de90a8e740a35ad02c6f7b3fd4e130a9a4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/309.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/31.manifest b/.lancedb/nltk_chunking.lance/_versions/31.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a45f61f100c44aeb8868b94d7b6f8cfcb872f3f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/31.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/310.manifest b/.lancedb/nltk_chunking.lance/_versions/310.manifest new file mode 100644 index 0000000000000000000000000000000000000000..44f440fa36204310822356a09d51165349ac09d4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/310.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/311.manifest b/.lancedb/nltk_chunking.lance/_versions/311.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8e7496877e533b487f3c94cbe3d2f6acc60d8d86 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/311.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/312.manifest b/.lancedb/nltk_chunking.lance/_versions/312.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ba45a5a2983736773ebe329dc18272216ac817ff Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/312.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/313.manifest b/.lancedb/nltk_chunking.lance/_versions/313.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c234cda1357d5a6ac6c85b5e422a561e6a540026 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/313.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/314.manifest b/.lancedb/nltk_chunking.lance/_versions/314.manifest new file mode 100644 index 0000000000000000000000000000000000000000..919c8962e90f8036ad04eccf9fe94996da535c6d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/314.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/315.manifest b/.lancedb/nltk_chunking.lance/_versions/315.manifest new file mode 100644 index 0000000000000000000000000000000000000000..00917a71e7f14cec38aa646e11d39360b857ef0e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/315.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/316.manifest b/.lancedb/nltk_chunking.lance/_versions/316.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4e9bca7f1f590ea77dad49a7e47a148a30471302 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/316.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/317.manifest b/.lancedb/nltk_chunking.lance/_versions/317.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7a80944a845e7e625dd14c6aec74e7979604a977 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/317.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/318.manifest b/.lancedb/nltk_chunking.lance/_versions/318.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..ebf6bca702b09c8878796988c020ebba38920186 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/318.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/319.manifest b/.lancedb/nltk_chunking.lance/_versions/319.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3d4e1e1d52ebfa26fcfb7f061613f7d439326b72 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/319.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/32.manifest b/.lancedb/nltk_chunking.lance/_versions/32.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8b64bdea9da7a3be38365f9f78fb9362cae0f444 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/32.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/320.manifest b/.lancedb/nltk_chunking.lance/_versions/320.manifest new file mode 100644 index 0000000000000000000000000000000000000000..086ae6b18aeac6bfa7d1398827b52e87b6b46e3f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/320.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/321.manifest b/.lancedb/nltk_chunking.lance/_versions/321.manifest new file mode 100644 index 0000000000000000000000000000000000000000..66a8d6d23a80e74db52cef495130c1f6c5c49cf0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/321.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/322.manifest b/.lancedb/nltk_chunking.lance/_versions/322.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fa44acef7831b01c839562045ba4c6dfa8f50a67 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/322.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/323.manifest b/.lancedb/nltk_chunking.lance/_versions/323.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d7e41c78ea562695695043ca545a20577f4e7e12 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/323.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/324.manifest b/.lancedb/nltk_chunking.lance/_versions/324.manifest new file mode 100644 index 0000000000000000000000000000000000000000..82e8721892dd2715850d74bf98929b5d6dab33af Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/324.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/325.manifest b/.lancedb/nltk_chunking.lance/_versions/325.manifest new file mode 100644 index 0000000000000000000000000000000000000000..db9aa0a52e4cdc344a6ae0a73771e82e0aea0606 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/325.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/326.manifest b/.lancedb/nltk_chunking.lance/_versions/326.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c506aa6258f61e6611bbf26ebe898b5727ebbbd7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/326.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/327.manifest b/.lancedb/nltk_chunking.lance/_versions/327.manifest new file mode 100644 index 0000000000000000000000000000000000000000..06f6ecfd6664f75a9009a64590fc5de7e2f4f702 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/327.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/328.manifest b/.lancedb/nltk_chunking.lance/_versions/328.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..36de0f116f0f1b69993280bf795877abc49240f1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/328.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/329.manifest b/.lancedb/nltk_chunking.lance/_versions/329.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb732443f27ff16e6e3e475240bb1e4a9e00b6d3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/329.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/33.manifest b/.lancedb/nltk_chunking.lance/_versions/33.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e279111f68200dbdfe7a124aeb7d17ddee46c9cc Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/33.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/330.manifest b/.lancedb/nltk_chunking.lance/_versions/330.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a79f74a84f9ad968bcb0826466e45829184c73d9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/330.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/331.manifest b/.lancedb/nltk_chunking.lance/_versions/331.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bfb18117f83cb47dd80b06ed72966db2af6ee392 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/331.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/332.manifest b/.lancedb/nltk_chunking.lance/_versions/332.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1634279912185ab02c3def5209c348b88b9decb4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/332.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/333.manifest b/.lancedb/nltk_chunking.lance/_versions/333.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1872c536feffd62efae317676503b7b05b18c37a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/333.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/334.manifest b/.lancedb/nltk_chunking.lance/_versions/334.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1678ee997fd36516f321ae9a2804747a037df877 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/334.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/335.manifest b/.lancedb/nltk_chunking.lance/_versions/335.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da723ceda224912c352f369c063c1d26cc355bc2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/335.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/336.manifest b/.lancedb/nltk_chunking.lance/_versions/336.manifest new file mode 100644 index 0000000000000000000000000000000000000000..19b231db8b9c2a5b76a75d0bba521f6efe78c227 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/336.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/337.manifest b/.lancedb/nltk_chunking.lance/_versions/337.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8e6ceb673ee2492c0ed37f1c46e50d0cdc342bc6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/337.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/338.manifest b/.lancedb/nltk_chunking.lance/_versions/338.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..04c6f97c4eb857fbff1e123ad937ac9128d345c9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/338.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/339.manifest b/.lancedb/nltk_chunking.lance/_versions/339.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c91e8ae7d5d617e0234c742b45ef4cb038b9b6b5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/339.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/34.manifest b/.lancedb/nltk_chunking.lance/_versions/34.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5885eb21c4a82b7297bd981de991c46272d68df4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/34.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/340.manifest b/.lancedb/nltk_chunking.lance/_versions/340.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ac434555486d4d38272f05264b56a6ea415e9526 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/340.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/341.manifest b/.lancedb/nltk_chunking.lance/_versions/341.manifest new file mode 100644 index 0000000000000000000000000000000000000000..efaf63f44fd3d5fdd6279699496b889bc4b1c794 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/341.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/342.manifest b/.lancedb/nltk_chunking.lance/_versions/342.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5e1cce2f222410220337b0b751b31339e795cc51 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/342.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/343.manifest b/.lancedb/nltk_chunking.lance/_versions/343.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e5bdbd25cff56f9174c11b4af2ba6643786484d3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/343.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/344.manifest b/.lancedb/nltk_chunking.lance/_versions/344.manifest new file mode 100644 index 0000000000000000000000000000000000000000..36d5e6aa89d485b5ff949efc539bd3c57f858355 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/344.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/345.manifest b/.lancedb/nltk_chunking.lance/_versions/345.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ae4d53b9ed3bd994dde2fb60e148fcb1b24cf5c7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/345.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/346.manifest b/.lancedb/nltk_chunking.lance/_versions/346.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd806b9ee5e9ad05246c431026b9cd2536af9a22 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/346.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/347.manifest b/.lancedb/nltk_chunking.lance/_versions/347.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2724e356307de0810cd2e4dca8876a2d1aa0e500 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/347.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/348.manifest b/.lancedb/nltk_chunking.lance/_versions/348.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4ee4c0be1b9a1ad23bef1a279ad7f1fe94aa361d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/348.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/349.manifest b/.lancedb/nltk_chunking.lance/_versions/349.manifest new file mode 100644 index 0000000000000000000000000000000000000000..02598ac1ec7319f8158d4b435ad2efe118626ce2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/349.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/35.manifest b/.lancedb/nltk_chunking.lance/_versions/35.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8ee97006ca055c37ea2fbb19224912fb5449bada Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/35.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/350.manifest b/.lancedb/nltk_chunking.lance/_versions/350.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b1bf3906f12d7813f24cff9c00c4185b0817d08a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/350.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/351.manifest b/.lancedb/nltk_chunking.lance/_versions/351.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a1ef96c7890a12b26bee6dc0adcf7bc532ae52c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/351.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/352.manifest b/.lancedb/nltk_chunking.lance/_versions/352.manifest new file mode 100644 index 0000000000000000000000000000000000000000..42bbc7fb2b9254d179d7c37f35f7dd72e8b23c40 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/352.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/353.manifest b/.lancedb/nltk_chunking.lance/_versions/353.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dca20eef82c4d6876a7555c7bb43026ef49cb285 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/353.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/354.manifest b/.lancedb/nltk_chunking.lance/_versions/354.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd29bacde9c9dd8a0f908359de8ac342d4147131 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/354.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/355.manifest b/.lancedb/nltk_chunking.lance/_versions/355.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1fa24fbc8f6ca2b059b4147259e022b019e886a3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/355.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/356.manifest b/.lancedb/nltk_chunking.lance/_versions/356.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9f91a71c0e2cf7218c55d70db2f1fa6d077773db Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/356.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/357.manifest b/.lancedb/nltk_chunking.lance/_versions/357.manifest new file mode 100644 index 0000000000000000000000000000000000000000..402d8b4b989cf504afb40604fcdd6a4ae7a9f63a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/357.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/358.manifest b/.lancedb/nltk_chunking.lance/_versions/358.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..9380d175f4acd2fcfd925541bb794c3eeccd0ac6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/358.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/359.manifest b/.lancedb/nltk_chunking.lance/_versions/359.manifest new file mode 100644 index 0000000000000000000000000000000000000000..defde6341c38a2a802eacbe1d8f5802e748a38ef Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/359.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/36.manifest b/.lancedb/nltk_chunking.lance/_versions/36.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f0125a308b27515de14119b8aa1d13b86eadfb4d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/36.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/360.manifest b/.lancedb/nltk_chunking.lance/_versions/360.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d2629bdfbac5e508fbf34d47922f27b2c27ea2d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/360.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/361.manifest b/.lancedb/nltk_chunking.lance/_versions/361.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e9c8732a441986f85b639baf32b59791f59bf85 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/361.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/362.manifest b/.lancedb/nltk_chunking.lance/_versions/362.manifest new file mode 100644 index 0000000000000000000000000000000000000000..60595c45837085c557e7dc052dadc3f1aba8fc4b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/362.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/363.manifest b/.lancedb/nltk_chunking.lance/_versions/363.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b9e3ace14113dbb2a7df8fe63f71af1e9feba5f7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/363.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/364.manifest b/.lancedb/nltk_chunking.lance/_versions/364.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cf87daac541f3bcf3d0cb35aa12401f160e88286 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/364.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/365.manifest b/.lancedb/nltk_chunking.lance/_versions/365.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7926ca7692a8e7b8895e75b8c0218ce09086787d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/365.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/366.manifest b/.lancedb/nltk_chunking.lance/_versions/366.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b95081734a81e57ce993c7ced77936af6b809ee5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/366.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/367.manifest b/.lancedb/nltk_chunking.lance/_versions/367.manifest new file mode 100644 index 0000000000000000000000000000000000000000..caed6685a53b7974772dbf9002863c942ac48cc8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/367.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/368.manifest b/.lancedb/nltk_chunking.lance/_versions/368.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..4fa7e968c6a28e6c7b9e387aa66d4165305b8238 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/368.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/369.manifest b/.lancedb/nltk_chunking.lance/_versions/369.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e26823cff5e7e86cc553f430522c7995772c889 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/369.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/37.manifest b/.lancedb/nltk_chunking.lance/_versions/37.manifest new file mode 100644 index 0000000000000000000000000000000000000000..04da8f8f93e0bf3a0046fe7c687bfbbad4514058 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/37.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/370.manifest b/.lancedb/nltk_chunking.lance/_versions/370.manifest new file mode 100644 index 0000000000000000000000000000000000000000..37bc1700cdc91d35584f9adc1d41ab1250018ac6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/370.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/371.manifest b/.lancedb/nltk_chunking.lance/_versions/371.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1a0ab48dc2d44d7fa3e237179a0bafe296036b42 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/371.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/372.manifest b/.lancedb/nltk_chunking.lance/_versions/372.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9800ffd2931995f77644f4790f498386cc6915ac Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/372.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/373.manifest b/.lancedb/nltk_chunking.lance/_versions/373.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1e1b5c20146cc541ab7f38c2229750502c2ab36f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/373.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/374.manifest b/.lancedb/nltk_chunking.lance/_versions/374.manifest new file mode 100644 index 0000000000000000000000000000000000000000..396b373c5351e907a7b41baf3ea3b98137914bd5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/374.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/375.manifest b/.lancedb/nltk_chunking.lance/_versions/375.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8fd3ec383aa98288a43dfed64618e30a00e678d7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/375.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/376.manifest b/.lancedb/nltk_chunking.lance/_versions/376.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cade9a39e508b7af46c92f8fcb35bf1d98dd987c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/376.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/377.manifest b/.lancedb/nltk_chunking.lance/_versions/377.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a896abd118b69d711cb8ca57650ab9e584f4fe9d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/377.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/378.manifest b/.lancedb/nltk_chunking.lance/_versions/378.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..617719e1d3e15e81979da91782e4aa6e1881134e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/378.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/379.manifest b/.lancedb/nltk_chunking.lance/_versions/379.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1b697502c886991f09e5a81079f7a8c9a64f1ca2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/379.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/38.manifest b/.lancedb/nltk_chunking.lance/_versions/38.manifest new file mode 100644 index 0000000000000000000000000000000000000000..872a22cc7dcfc98b05319ba8b250fdb91ce09f5f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/38.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/380.manifest b/.lancedb/nltk_chunking.lance/_versions/380.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b9b8a8b715aa19e2f805f0a4aa8b800522e2e38a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/380.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/381.manifest b/.lancedb/nltk_chunking.lance/_versions/381.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9e4b2b940930233730c29e2cc982541d977a4684 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/381.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/382.manifest b/.lancedb/nltk_chunking.lance/_versions/382.manifest new file mode 100644 index 0000000000000000000000000000000000000000..541de2ea0377a8ead493e5fc6231f88be3d1dbe8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/382.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/383.manifest b/.lancedb/nltk_chunking.lance/_versions/383.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c65a2d81bda88ca818d6d5dd47bdcfc5eb72bc75 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/383.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/384.manifest b/.lancedb/nltk_chunking.lance/_versions/384.manifest new file mode 100644 index 0000000000000000000000000000000000000000..325ff7036fbed516fd3497daf535d45d1ddddde1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/384.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/385.manifest b/.lancedb/nltk_chunking.lance/_versions/385.manifest new file mode 100644 index 0000000000000000000000000000000000000000..36b8cf311831a6d64c11e028ef677d2af8966c0a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/385.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/386.manifest b/.lancedb/nltk_chunking.lance/_versions/386.manifest new file mode 100644 index 0000000000000000000000000000000000000000..597787e5ee619a6deb5d175109bd139b556ffaf0 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/386.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/387.manifest b/.lancedb/nltk_chunking.lance/_versions/387.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dd5e146ddb7be1c553ec6974e0f9248e78e9d120 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/387.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/388.manifest b/.lancedb/nltk_chunking.lance/_versions/388.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..eb032d8de2001f280f7fe7e0427a13aa4441880f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/388.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/389.manifest b/.lancedb/nltk_chunking.lance/_versions/389.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f33bcf2e47ec3ffe2f5adcd119ccebff52a0216a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/389.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/39.manifest b/.lancedb/nltk_chunking.lance/_versions/39.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fb8af2a439bdfcac702b1406861c52414b7a04b6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/39.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/390.manifest b/.lancedb/nltk_chunking.lance/_versions/390.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cb108a9dd6d5243b938a2c39402b7b5228a7937a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/390.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/391.manifest b/.lancedb/nltk_chunking.lance/_versions/391.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5e730424e7491681d0ffabb7d7c875dd45e9022c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/391.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/392.manifest b/.lancedb/nltk_chunking.lance/_versions/392.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ad2291e2c3bf30c1c0dbc57fd4991a09fd3a3e9f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/392.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/393.manifest b/.lancedb/nltk_chunking.lance/_versions/393.manifest new file mode 100644 index 0000000000000000000000000000000000000000..980126c2479ce6c6912290d723405d7e8906ba12 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/393.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/394.manifest b/.lancedb/nltk_chunking.lance/_versions/394.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fc80421ac77269f38e43751983669cf62ba8fe99 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/394.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/395.manifest b/.lancedb/nltk_chunking.lance/_versions/395.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1ab9f46e5fb9fb4204de7e05c7d4d0c1ddcf6d1c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/395.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/396.manifest b/.lancedb/nltk_chunking.lance/_versions/396.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fbe967f77a3b9e6293167b2e99f46a25abd86197 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/396.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/397.manifest b/.lancedb/nltk_chunking.lance/_versions/397.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d03cb49b809f396b0ffb42e924f09a118ec863dd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/397.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/398.manifest b/.lancedb/nltk_chunking.lance/_versions/398.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..5dfa529d6ff071c0c10f0f00f8cbc15911555536 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/398.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/4.manifest b/.lancedb/nltk_chunking.lance/_versions/4.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4406a32c903e5ca768b472d2d7d4a2ae1716d69d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/4.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/40.manifest b/.lancedb/nltk_chunking.lance/_versions/40.manifest new file mode 100644 index 0000000000000000000000000000000000000000..311cc7954c413afd7e8563032a1bd14de13918e3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/40.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/41.manifest b/.lancedb/nltk_chunking.lance/_versions/41.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7d87c51c2e086f05b0584816745483bd70910fde Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/41.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/42.manifest b/.lancedb/nltk_chunking.lance/_versions/42.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ce741443502a8153434e0e3b6e6dbec77432379c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/42.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/43.manifest b/.lancedb/nltk_chunking.lance/_versions/43.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f29a52e5f0e55454bc5002ecc90c9086ae725568 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/43.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/44.manifest b/.lancedb/nltk_chunking.lance/_versions/44.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b15e23d6dfd132b3b82a7b11a2bcee75526a07d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/44.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/45.manifest b/.lancedb/nltk_chunking.lance/_versions/45.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a89f77fb3603130cd3f30027345e6d29dcdae78c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/45.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/46.manifest b/.lancedb/nltk_chunking.lance/_versions/46.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a060fb414676938be5ae2f065993071d597e6494 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/46.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/47.manifest b/.lancedb/nltk_chunking.lance/_versions/47.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3ec598e62b861c0be54d14c241021fb420d92d4a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/47.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/48.manifest b/.lancedb/nltk_chunking.lance/_versions/48.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f5d7358f2e351349d9173b9815510665405e934b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/48.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/49.manifest b/.lancedb/nltk_chunking.lance/_versions/49.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5455909f21c98b0107560ecdfe35b64d33b7628a Binary files 
/dev/null and b/.lancedb/nltk_chunking.lance/_versions/49.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/5.manifest b/.lancedb/nltk_chunking.lance/_versions/5.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c01fbbe48867597cc716ae54ec596f68b7899df1 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/5.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/50.manifest b/.lancedb/nltk_chunking.lance/_versions/50.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6336e16458d5375ed249cf7953a664948d3379f5 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/50.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/51.manifest b/.lancedb/nltk_chunking.lance/_versions/51.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e069e7c04f2f078425acbda46fdba8f8387beb7f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/51.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/52.manifest b/.lancedb/nltk_chunking.lance/_versions/52.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e2087b9989aa2177713f8a9b7a437cf9ace3bfb6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/52.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/53.manifest b/.lancedb/nltk_chunking.lance/_versions/53.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ec7862c2ea655d1fcb2c7705248c6013cd16c533 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/53.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/54.manifest b/.lancedb/nltk_chunking.lance/_versions/54.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7d372819cde3b2edc098b9384c3551640883d005 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/54.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/55.manifest b/.lancedb/nltk_chunking.lance/_versions/55.manifest new file mode 100644 index 0000000000000000000000000000000000000000..73c33844993112ff3cb321be1b6940cbb85e6f2e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/55.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/56.manifest b/.lancedb/nltk_chunking.lance/_versions/56.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6244ad45676b43772667b28b9e6386e9f0e9ec6e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/56.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/57.manifest b/.lancedb/nltk_chunking.lance/_versions/57.manifest new file mode 100644 index 0000000000000000000000000000000000000000..98af59ed69db9b2b5ed9a3a02aacc8ba909a7787 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/57.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/58.manifest b/.lancedb/nltk_chunking.lance/_versions/58.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a524612f34b618004d47a11137493e67a15e93f2 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/58.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/59.manifest b/.lancedb/nltk_chunking.lance/_versions/59.manifest new file mode 100644 index 0000000000000000000000000000000000000000..712f8cf5585bddf6a814be1a3650d3e48a619d35 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/59.manifest differ diff --git 
a/.lancedb/nltk_chunking.lance/_versions/6.manifest b/.lancedb/nltk_chunking.lance/_versions/6.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4ad00cd775e6bcb6b6ef2643cd19434308c7a65f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/6.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/60.manifest b/.lancedb/nltk_chunking.lance/_versions/60.manifest new file mode 100644 index 0000000000000000000000000000000000000000..aa0f55cb084a20edcbd4e87de0b940b7535416cf Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/60.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/61.manifest b/.lancedb/nltk_chunking.lance/_versions/61.manifest new file mode 100644 index 0000000000000000000000000000000000000000..250886f09a4205216a18461c010224a34b022f12 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/61.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/62.manifest b/.lancedb/nltk_chunking.lance/_versions/62.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3e0ad9a93aa096f466bb217df8940c9decb77d3f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/62.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/63.manifest b/.lancedb/nltk_chunking.lance/_versions/63.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb70967577c4edcdefd37d0a49ad5c42dda724a3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/63.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/64.manifest b/.lancedb/nltk_chunking.lance/_versions/64.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ec7ffc1dcc989ae8b4d05d63607a93fe1bd8d991 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/64.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/65.manifest b/.lancedb/nltk_chunking.lance/_versions/65.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3793a4bca4ea21840310f1f6277b91947230d3c3 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/65.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/66.manifest b/.lancedb/nltk_chunking.lance/_versions/66.manifest new file mode 100644 index 0000000000000000000000000000000000000000..67ce17fe2fcc051073e607fb35efe2fa0855651d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/66.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/67.manifest b/.lancedb/nltk_chunking.lance/_versions/67.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f7e36e3bb4925e4c88294ae53b127b66becf4016 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/67.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/68.manifest b/.lancedb/nltk_chunking.lance/_versions/68.manifest new file mode 100644 index 0000000000000000000000000000000000000000..56487fb3408b62b6c747762cd852818564ebc94b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/68.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/69.manifest b/.lancedb/nltk_chunking.lance/_versions/69.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fc9787659ee02f36c981bd8e8ddafad2834e2808 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/69.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/7.manifest b/.lancedb/nltk_chunking.lance/_versions/7.manifest 
new file mode 100644 index 0000000000000000000000000000000000000000..35535e7886105f15528af0f529689b851d121d8d Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/7.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/70.manifest b/.lancedb/nltk_chunking.lance/_versions/70.manifest new file mode 100644 index 0000000000000000000000000000000000000000..817e3f8851867557cd1726faa5dee7117e22c5e8 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/70.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/71.manifest b/.lancedb/nltk_chunking.lance/_versions/71.manifest new file mode 100644 index 0000000000000000000000000000000000000000..703c20efb12754f03366cdd45c2ab7589bfffd36 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/71.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/72.manifest b/.lancedb/nltk_chunking.lance/_versions/72.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd5e842f88548a15fadd9d1adf36a7311def4447 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/72.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/73.manifest b/.lancedb/nltk_chunking.lance/_versions/73.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9662895d3cd7ced2a2ef8ccabeead2a0fa32fd76 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/73.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/74.manifest b/.lancedb/nltk_chunking.lance/_versions/74.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a106de83eeafc3dadeb96c8ade6d616abc914a8c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/74.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/75.manifest b/.lancedb/nltk_chunking.lance/_versions/75.manifest new file mode 100644 index 0000000000000000000000000000000000000000..893869960064f3ad8a1d0e5f1ced2cc504787db7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/75.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/76.manifest b/.lancedb/nltk_chunking.lance/_versions/76.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9fba53b67493c815e0c56b2857b163305126f429 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/76.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/77.manifest b/.lancedb/nltk_chunking.lance/_versions/77.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b21bc1f30b0ad32d1ef251c64b02a0c1fa6030bb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/77.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/78.manifest b/.lancedb/nltk_chunking.lance/_versions/78.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0932be299d64555de67139ac748a1941e973056a Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/78.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/79.manifest b/.lancedb/nltk_chunking.lance/_versions/79.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e5079480752b82d9f7cf543a718121eb8adbd58c Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/79.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/8.manifest b/.lancedb/nltk_chunking.lance/_versions/8.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..396f9b3e09be6b7fdff76f0b47aef66d2ff06678 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/8.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/80.manifest b/.lancedb/nltk_chunking.lance/_versions/80.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7012c0c8af4231829b4ac08ff13970583ff88f25 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/80.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/81.manifest b/.lancedb/nltk_chunking.lance/_versions/81.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0d37b94821072681a2b061e3d5e7b82889b07be9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/81.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/82.manifest b/.lancedb/nltk_chunking.lance/_versions/82.manifest new file mode 100644 index 0000000000000000000000000000000000000000..74598bf579ed09ba2d9876bd771629c9a63a58b6 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/82.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/83.manifest b/.lancedb/nltk_chunking.lance/_versions/83.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5439eb3d67c145e3b3aba090666d6d11d473c73e Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/83.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/84.manifest b/.lancedb/nltk_chunking.lance/_versions/84.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d0e77e0c9996373e654f5e0374849519e1e0285b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/84.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/85.manifest b/.lancedb/nltk_chunking.lance/_versions/85.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f200a23afdb683b4a6b5bc8deb7975bef2de693b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/85.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/86.manifest b/.lancedb/nltk_chunking.lance/_versions/86.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9f950dd80dd9aaaa630f1b4b9fff19cfbcba5b8b Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/86.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/87.manifest b/.lancedb/nltk_chunking.lance/_versions/87.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1bd90311edf4a92a69343f714ae1194163773cac Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/87.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/88.manifest b/.lancedb/nltk_chunking.lance/_versions/88.manifest new file mode 100644 index 0000000000000000000000000000000000000000..106bca27990a37201eead4ba1f5771876583b8c7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/88.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/89.manifest b/.lancedb/nltk_chunking.lance/_versions/89.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0510738d0f235f1ed1c69b300a98a3988c7780f7 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/89.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/9.manifest b/.lancedb/nltk_chunking.lance/_versions/9.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8bbe063399c71c4d510dc027178b7f584e06b60f Binary files 
/dev/null and b/.lancedb/nltk_chunking.lance/_versions/9.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/90.manifest b/.lancedb/nltk_chunking.lance/_versions/90.manifest new file mode 100644 index 0000000000000000000000000000000000000000..23c57ab6cb4eecf20bd9869b609b81a6e4129597 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/90.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/91.manifest b/.lancedb/nltk_chunking.lance/_versions/91.manifest new file mode 100644 index 0000000000000000000000000000000000000000..61f09576234d0e82a0ee75361231bb1fb6763a59 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/91.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/92.manifest b/.lancedb/nltk_chunking.lance/_versions/92.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bd618ac01e1dccc05721b035b9d8c3fd44783b71 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/92.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/93.manifest b/.lancedb/nltk_chunking.lance/_versions/93.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c65ade0bdeb77b136fb54d39451cec80ff9f3434 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/93.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/94.manifest b/.lancedb/nltk_chunking.lance/_versions/94.manifest new file mode 100644 index 0000000000000000000000000000000000000000..721ad119e06f65fece4b345ba389c0c7e3eb9299 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/94.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/95.manifest b/.lancedb/nltk_chunking.lance/_versions/95.manifest new file mode 100644 index 0000000000000000000000000000000000000000..698b1b5fa23c4ad006d92eb97c931aaa3bed700f Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/95.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/96.manifest b/.lancedb/nltk_chunking.lance/_versions/96.manifest new file mode 100644 index 0000000000000000000000000000000000000000..63c08fbb0b7e23880f1ea9011ea50427ad7bdcc4 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/96.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/97.manifest b/.lancedb/nltk_chunking.lance/_versions/97.manifest new file mode 100644 index 0000000000000000000000000000000000000000..43658d83124c6adf969f4ab8d148795c7e7d25dd Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/97.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/98.manifest b/.lancedb/nltk_chunking.lance/_versions/98.manifest new file mode 100644 index 0000000000000000000000000000000000000000..085737f9b0302d80a277462aa1bca9c85e05a3d9 Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/98.manifest differ diff --git a/.lancedb/nltk_chunking.lance/_versions/99.manifest b/.lancedb/nltk_chunking.lance/_versions/99.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da9472d6bf9d3e2e80bd8772cd5d7467f95606eb Binary files /dev/null and b/.lancedb/nltk_chunking.lance/_versions/99.manifest differ diff --git a/.lancedb/nltk_chunking.lance/data/00550b2e-33ae-4add-b1d7-73cac211b543.lance b/.lancedb/nltk_chunking.lance/data/00550b2e-33ae-4add-b1d7-73cac211b543.lance new file mode 100644 index 0000000000000000000000000000000000000000..4a2fccedd7457bd86bbfcb0e3f0a6269bee891ae --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/00550b2e-33ae-4add-b1d7-73cac211b543.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:766a631296bd384fd8cda6e3eecd4691348e873501de24d0aecd3bf312ef3c65 +size 54606 diff --git a/.lancedb/nltk_chunking.lance/data/00a3e3f8-5b8b-415a-b256-9f2cbcee3599.lance b/.lancedb/nltk_chunking.lance/data/00a3e3f8-5b8b-415a-b256-9f2cbcee3599.lance new file mode 100644 index 0000000000000000000000000000000000000000..c8550cfbdd091acab64bdb66db400b53a22f7606 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/00a3e3f8-5b8b-415a-b256-9f2cbcee3599.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dcdb6daddc61deac584e64f62f11f0ddcdd9889e85ec5b81ac3a796ccbb9fdf +size 57007 diff --git a/.lancedb/nltk_chunking.lance/data/01a59ef6-6627-4d9a-b002-5d51f35ec296.lance b/.lancedb/nltk_chunking.lance/data/01a59ef6-6627-4d9a-b002-5d51f35ec296.lance new file mode 100644 index 0000000000000000000000000000000000000000..27f1dbe970617b7f75d03ebbaa51c9f5596f3eb1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/01a59ef6-6627-4d9a-b002-5d51f35ec296.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2ccf0c5f6809f423f2d9cbe09a72f9dec05ca6e3958614f949d0b35524c8e7b +size 55327 diff --git a/.lancedb/nltk_chunking.lance/data/0376bafc-8c98-4d66-9fca-b209314e487e.lance b/.lancedb/nltk_chunking.lance/data/0376bafc-8c98-4d66-9fca-b209314e487e.lance new file mode 100644 index 0000000000000000000000000000000000000000..16ce6795faf5d1b1ca82bcd4b956e0b91f74b985 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0376bafc-8c98-4d66-9fca-b209314e487e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1e54b3974f3f4d247abb3799ac62f13fbe2fb2bf94933d29716304d34108dd6 +size 55832 diff --git a/.lancedb/nltk_chunking.lance/data/04f03302-44e7-4019-811f-a454c837e565.lance b/.lancedb/nltk_chunking.lance/data/04f03302-44e7-4019-811f-a454c837e565.lance new file mode 100644 index 0000000000000000000000000000000000000000..0df57b5fb85d99898609888d3f440dd915ae5908 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/04f03302-44e7-4019-811f-a454c837e565.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b1e550d41ff6394abcfb98a41912414abc5c9ae2a64d9a69c908a7270319713 +size 57164 diff --git a/.lancedb/nltk_chunking.lance/data/05c03bac-a65c-4d73-81a8-451588afb9ae.lance b/.lancedb/nltk_chunking.lance/data/05c03bac-a65c-4d73-81a8-451588afb9ae.lance new file mode 100644 index 0000000000000000000000000000000000000000..10534386f358525095ce68392ffcd3cee37e48bb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/05c03bac-a65c-4d73-81a8-451588afb9ae.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a3c1424aeb0e01e267027ba05391d3ff262b1d58af5d11d0ecdf7063c1315fa +size 55222 diff --git a/.lancedb/nltk_chunking.lance/data/072efee1-8bf8-4be3-a703-5713b5d01427.lance b/.lancedb/nltk_chunking.lance/data/072efee1-8bf8-4be3-a703-5713b5d01427.lance new file mode 100644 index 0000000000000000000000000000000000000000..dc4d7c4233da33ad41c9222686f8f8cf099cfd36 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/072efee1-8bf8-4be3-a703-5713b5d01427.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23a5da5bef21e41ca5ef7ef4db33b55aff98a3a0bd88a8b4cbea2c2c24310b7c +size 55023 diff --git a/.lancedb/nltk_chunking.lance/data/080427ba-d334-43f0-b492-8e893f2ae8eb.lance b/.lancedb/nltk_chunking.lance/data/080427ba-d334-43f0-b492-8e893f2ae8eb.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..f11845896987748f35bf59b4e109a98ecdd8d80c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/080427ba-d334-43f0-b492-8e893f2ae8eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a3055786d20aa0fc250c5707908637f4963b404ed2ed952cae45ef3f9592d99 +size 55372 diff --git a/.lancedb/nltk_chunking.lance/data/0838c030-603b-4984-89ba-a31e41ca17ee.lance b/.lancedb/nltk_chunking.lance/data/0838c030-603b-4984-89ba-a31e41ca17ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..d1fc3f3ccd25018a653b88b41a6080e3de2b62d1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0838c030-603b-4984-89ba-a31e41ca17ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1da793cb4b0e23d9e4409e978eb097070c73e4d14c937f1715d3bb4b2c41663 +size 55551 diff --git a/.lancedb/nltk_chunking.lance/data/084322e1-870e-4d28-9fd0-20ab0277e12e.lance b/.lancedb/nltk_chunking.lance/data/084322e1-870e-4d28-9fd0-20ab0277e12e.lance new file mode 100644 index 0000000000000000000000000000000000000000..7e5821b29ef3b4ac0b24e7d150d9f8d77f4928ab --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/084322e1-870e-4d28-9fd0-20ab0277e12e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6e7f3d1e1bdb3ae2df5121ba94fb96b01213e63c74cb614a1f35bd08aead336 +size 59376 diff --git a/.lancedb/nltk_chunking.lance/data/08bcee19-6656-4de1-b007-57b8d6583482.lance b/.lancedb/nltk_chunking.lance/data/08bcee19-6656-4de1-b007-57b8d6583482.lance new file mode 100644 index 0000000000000000000000000000000000000000..39ad809daf1fa5a2ea8138df32ffdb96423fe48f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/08bcee19-6656-4de1-b007-57b8d6583482.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b066b45cd213587838c04ad5d20a70aa015b03deda3e5b365c29ad87e60a9c8c +size 55083 diff --git a/.lancedb/nltk_chunking.lance/data/08e04dca-a112-41ed-8d79-3cb096f8960e.lance b/.lancedb/nltk_chunking.lance/data/08e04dca-a112-41ed-8d79-3cb096f8960e.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e98ec385ca519dddcf247a46fe907a2215e0689 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/08e04dca-a112-41ed-8d79-3cb096f8960e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f713fd41313e1504d03e5f427a6ad23b99b5b5658d0c1eb64b174fd4a362d0a1 +size 56767 diff --git a/.lancedb/nltk_chunking.lance/data/0a14551a-cc66-4878-9a35-28f6276b61a6.lance b/.lancedb/nltk_chunking.lance/data/0a14551a-cc66-4878-9a35-28f6276b61a6.lance new file mode 100644 index 0000000000000000000000000000000000000000..4c70cb64d1642021b9c8910b7db3ca926375c58d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0a14551a-cc66-4878-9a35-28f6276b61a6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecdfa61f0daca4c597a36758d06ac0c0663833e0a69bdf12f8d8abb966b46f0d +size 58576 diff --git a/.lancedb/nltk_chunking.lance/data/0ad3d547-ba25-4d81-b62c-85c481589b16.lance b/.lancedb/nltk_chunking.lance/data/0ad3d547-ba25-4d81-b62c-85c481589b16.lance new file mode 100644 index 0000000000000000000000000000000000000000..6d1e228aa08f1afe5c66f50e1112f03bac10085b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0ad3d547-ba25-4d81-b62c-85c481589b16.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eedbc6bc975883692826e153fad7c45c2fa9c3791358396e4059dcb0784121d1 +size 54235 diff --git 
a/.lancedb/nltk_chunking.lance/data/0ae2c68a-5241-4dbd-9bd8-94b5ba5610a1.lance b/.lancedb/nltk_chunking.lance/data/0ae2c68a-5241-4dbd-9bd8-94b5ba5610a1.lance new file mode 100644 index 0000000000000000000000000000000000000000..d079df581a37278ce9586752b9681c0d39da4e1f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0ae2c68a-5241-4dbd-9bd8-94b5ba5610a1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67f607e7b8c6d53fbda82691984f4eac60422dde1b1fc3858baea74c24fdf36 +size 58738 diff --git a/.lancedb/nltk_chunking.lance/data/0b704a79-1ccb-4beb-a5b2-1d939da65ae8.lance b/.lancedb/nltk_chunking.lance/data/0b704a79-1ccb-4beb-a5b2-1d939da65ae8.lance new file mode 100644 index 0000000000000000000000000000000000000000..874c80d1f65520d2d97cd1f31285bcf30e8dec77 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0b704a79-1ccb-4beb-a5b2-1d939da65ae8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494806e4e57d0bb89481f630a9385d601a50bdeba653ddd63dae15d63df7eb32 +size 55962 diff --git a/.lancedb/nltk_chunking.lance/data/0b941ff1-e253-4042-ae3b-cd4a0030294d.lance b/.lancedb/nltk_chunking.lance/data/0b941ff1-e253-4042-ae3b-cd4a0030294d.lance new file mode 100644 index 0000000000000000000000000000000000000000..28dcb47a842c10fe78b048d230b1a2baab8537f4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0b941ff1-e253-4042-ae3b-cd4a0030294d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d59d4acd634e0bcbc4b2bc43b47a44a36f82ae21f3eee6b2a72d8dd060a2df0f +size 55705 diff --git a/.lancedb/nltk_chunking.lance/data/0cb37b29-7949-41aa-93fb-a7d818c705b6.lance b/.lancedb/nltk_chunking.lance/data/0cb37b29-7949-41aa-93fb-a7d818c705b6.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd2d168200513bc1b0f8f5cf58b314e89a4f7fd5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0cb37b29-7949-41aa-93fb-a7d818c705b6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0083c59f565d34133a783df254254331e16854f05cf0222c001f29206357d39d +size 54338 diff --git a/.lancedb/nltk_chunking.lance/data/0ce388c5-5784-4917-9358-46e4647bbcb3.lance b/.lancedb/nltk_chunking.lance/data/0ce388c5-5784-4917-9358-46e4647bbcb3.lance new file mode 100644 index 0000000000000000000000000000000000000000..f46822433605625db7b2bc67c3c795ed3c531121 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0ce388c5-5784-4917-9358-46e4647bbcb3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b14ace5533d82ca659cc724f1af605dc20647fcb7bbb2677ba40d2277f13a72 +size 60492 diff --git a/.lancedb/nltk_chunking.lance/data/0d00aac8-20cd-4961-81b5-fa227d435816.lance b/.lancedb/nltk_chunking.lance/data/0d00aac8-20cd-4961-81b5-fa227d435816.lance new file mode 100644 index 0000000000000000000000000000000000000000..e65cea43161ab9a1a56cf53712c0f4d7d8dcc4fe --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0d00aac8-20cd-4961-81b5-fa227d435816.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f892ddfccebcc07da5f5283176d636a5dae7720b2f7b2cad5afe57c49b258843 +size 59103 diff --git a/.lancedb/nltk_chunking.lance/data/0d146f14-4da8-4b7a-8406-e43df9c6ec65.lance b/.lancedb/nltk_chunking.lance/data/0d146f14-4da8-4b7a-8406-e43df9c6ec65.lance new file mode 100644 index 0000000000000000000000000000000000000000..ded5e990f9a89750d0382c572ec805a3ccb900d6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0d146f14-4da8-4b7a-8406-e43df9c6ec65.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:a4774ed2669cd15738e070466e95a28e255f3357f35bba8ef0adc1073125163c +size 56598 diff --git a/.lancedb/nltk_chunking.lance/data/0d412caf-6c69-4e62-8f40-af5248de3808.lance b/.lancedb/nltk_chunking.lance/data/0d412caf-6c69-4e62-8f40-af5248de3808.lance new file mode 100644 index 0000000000000000000000000000000000000000..839c514986fd6a3334e38bd2cb4052e92dfea004 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0d412caf-6c69-4e62-8f40-af5248de3808.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ada6aadfb14c25e7567dbd99d014162c9e84cfbebab3c77ca991442657a63c5d +size 61513 diff --git a/.lancedb/nltk_chunking.lance/data/0ef14acc-8795-4722-9c8e-5ab4324948ea.lance b/.lancedb/nltk_chunking.lance/data/0ef14acc-8795-4722-9c8e-5ab4324948ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..8d068235f11f24da5f800dffd275707581820877 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0ef14acc-8795-4722-9c8e-5ab4324948ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46358246d4def8d5875eab07cc96420e83532142ca4f194a671904660fefafa +size 53922 diff --git a/.lancedb/nltk_chunking.lance/data/0fbf5d82-e20c-405f-bdbd-7ad2f55c7205.lance b/.lancedb/nltk_chunking.lance/data/0fbf5d82-e20c-405f-bdbd-7ad2f55c7205.lance new file mode 100644 index 0000000000000000000000000000000000000000..3156e50f0e8f01a4c0233e0ca1163f44e337fcfa --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/0fbf5d82-e20c-405f-bdbd-7ad2f55c7205.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab871204b703d1b3303473efe8cac27d786180e30cb4678158a3e5d09d1ea60 +size 55416 diff --git a/.lancedb/nltk_chunking.lance/data/10e43a5d-7074-4a21-acc7-e1a0e019428a.lance b/.lancedb/nltk_chunking.lance/data/10e43a5d-7074-4a21-acc7-e1a0e019428a.lance new file mode 100644 index 0000000000000000000000000000000000000000..ccb9f8a5a1c776e9d74104a0a871fcca2945f94c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/10e43a5d-7074-4a21-acc7-e1a0e019428a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34059a1529cdf00a12cbe7cdb756f56895a3ace639c6fd3361c76cfeaf9bfa8c +size 55360 diff --git a/.lancedb/nltk_chunking.lance/data/10e9310e-ef2e-4da0-87ff-c6cb1b43e44b.lance b/.lancedb/nltk_chunking.lance/data/10e9310e-ef2e-4da0-87ff-c6cb1b43e44b.lance new file mode 100644 index 0000000000000000000000000000000000000000..82778ca565a0e56207b072e4706393b65b569e29 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/10e9310e-ef2e-4da0-87ff-c6cb1b43e44b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50e90d2369f5c02fd59bfb10b3247dc9d3e405654d6e8ec32cabdd6a890ff647 +size 53819 diff --git a/.lancedb/nltk_chunking.lance/data/1234ea02-71fe-4283-bf7f-7fd33795e618.lance b/.lancedb/nltk_chunking.lance/data/1234ea02-71fe-4283-bf7f-7fd33795e618.lance new file mode 100644 index 0000000000000000000000000000000000000000..f43d16bd5511dedd2dd9f7b187e805f72280a3fb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1234ea02-71fe-4283-bf7f-7fd33795e618.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbf495dc49671362c2c9bba3c54da3b5eb70422773d819cf9f60ae70a587e06 +size 58956 diff --git a/.lancedb/nltk_chunking.lance/data/1251e092-d725-409e-89a6-95b7cd05258c.lance b/.lancedb/nltk_chunking.lance/data/1251e092-d725-409e-89a6-95b7cd05258c.lance new file mode 100644 index 0000000000000000000000000000000000000000..be4dffadd624b1aeb10ddfd14f1611e5d7e98965 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1251e092-d725-409e-89a6-95b7cd05258c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c7ea7e3b9112a8562d4df03cc4c28a7cdd99ba5b3e1b5a7b2e48e750b2ca441 +size 57790 diff --git a/.lancedb/nltk_chunking.lance/data/12609d75-7ec1-4a25-857a-7b8c4e84e837.lance b/.lancedb/nltk_chunking.lance/data/12609d75-7ec1-4a25-857a-7b8c4e84e837.lance new file mode 100644 index 0000000000000000000000000000000000000000..42a96efc284e82c8b25fc7921cf87d0510142c9e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/12609d75-7ec1-4a25-857a-7b8c4e84e837.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c240e00c0cbcc35377157c02a225eae36b7ae1714acba7f4cfa17115430f13b +size 57405 diff --git a/.lancedb/nltk_chunking.lance/data/126e151c-1c49-4973-861f-25d268a1c1a8.lance b/.lancedb/nltk_chunking.lance/data/126e151c-1c49-4973-861f-25d268a1c1a8.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b60fefa4956b0402bae2f423930c36df6a4b5ad --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/126e151c-1c49-4973-861f-25d268a1c1a8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ae383bf4a0e0497f3cb0761547d4c050d2624c4a42f4cbbd6f874b3eecbcd0 +size 57833 diff --git a/.lancedb/nltk_chunking.lance/data/13abde94-cf49-44bd-9261-30f430cc83d6.lance b/.lancedb/nltk_chunking.lance/data/13abde94-cf49-44bd-9261-30f430cc83d6.lance new file mode 100644 index 0000000000000000000000000000000000000000..ae860a6abc3b66ce937985ce101505236ff8940e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/13abde94-cf49-44bd-9261-30f430cc83d6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcf286a5a8aca604319c5e951bdbec370d0fd75f6f762d40757dc351c206dc2 +size 55910 diff --git a/.lancedb/nltk_chunking.lance/data/13b4d01d-9f4a-4fbc-b5e3-e773ae851490.lance b/.lancedb/nltk_chunking.lance/data/13b4d01d-9f4a-4fbc-b5e3-e773ae851490.lance new file mode 100644 index 0000000000000000000000000000000000000000..85af333af0b86351f76dbbcc85a2b722c4c4f7bc --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/13b4d01d-9f4a-4fbc-b5e3-e773ae851490.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93b96cd9d7c86dc259f41b48d957dd59483349d270be6ff1a934c37b0bf6d27 +size 55817 diff --git a/.lancedb/nltk_chunking.lance/data/14386b20-5e06-48e6-a933-56b13e5ea092.lance b/.lancedb/nltk_chunking.lance/data/14386b20-5e06-48e6-a933-56b13e5ea092.lance new file mode 100644 index 0000000000000000000000000000000000000000..9e3e0abca3d8c954125277efc3c18ae2b61fc081 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/14386b20-5e06-48e6-a933-56b13e5ea092.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:423904778f41d5e7a18ebebbf130613e8a90d56064a9afde48855ba1f10a103c +size 59598 diff --git a/.lancedb/nltk_chunking.lance/data/143e4bed-795b-40b5-bf1a-1e2000cb9989.lance b/.lancedb/nltk_chunking.lance/data/143e4bed-795b-40b5-bf1a-1e2000cb9989.lance new file mode 100644 index 0000000000000000000000000000000000000000..28fe3fc179a87a2d8db25450e89ed70042ced73c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/143e4bed-795b-40b5-bf1a-1e2000cb9989.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:885810f52c4136b243b67ceda4b068ec00770b52799156d9da74a4c472f5a560 +size 55683 diff --git a/.lancedb/nltk_chunking.lance/data/146f4623-f84f-48b0-aca7-9842961a72dc.lance 
b/.lancedb/nltk_chunking.lance/data/146f4623-f84f-48b0-aca7-9842961a72dc.lance new file mode 100644 index 0000000000000000000000000000000000000000..418e550654d7b0bd1dc87aa375e22ab4cbd5ea7e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/146f4623-f84f-48b0-aca7-9842961a72dc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3392d058ed700720a5d22febf07221e0ac1c9461eec4b0b6c32a3a071ec3123d +size 55849 diff --git a/.lancedb/nltk_chunking.lance/data/14c8f517-8cb4-4e41-95a7-2b3b37b8fda5.lance b/.lancedb/nltk_chunking.lance/data/14c8f517-8cb4-4e41-95a7-2b3b37b8fda5.lance new file mode 100644 index 0000000000000000000000000000000000000000..5bc3d812921d1986b42bda757df06846e10702bb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/14c8f517-8cb4-4e41-95a7-2b3b37b8fda5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d65fa1742901e6e27d1165233252005ca3782c097407a3d74b14d484fe34853f +size 53635 diff --git a/.lancedb/nltk_chunking.lance/data/16b50dce-e311-448a-ab8d-d3883297cb98.lance b/.lancedb/nltk_chunking.lance/data/16b50dce-e311-448a-ab8d-d3883297cb98.lance new file mode 100644 index 0000000000000000000000000000000000000000..287b5d054801ba673af5af9a15c71ea2270e9269 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/16b50dce-e311-448a-ab8d-d3883297cb98.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3021057b9ad990001da5539879f927abf604c505ec203b75abc2dfdebe3bcc97 +size 57161 diff --git a/.lancedb/nltk_chunking.lance/data/17731287-68f1-4b2e-bf2e-84b8a6d7cc5d.lance b/.lancedb/nltk_chunking.lance/data/17731287-68f1-4b2e-bf2e-84b8a6d7cc5d.lance new file mode 100644 index 0000000000000000000000000000000000000000..424f844b140ecae3b765d254361ab496d94c1e27 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/17731287-68f1-4b2e-bf2e-84b8a6d7cc5d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f882de22ab90943615470954cd7cb446096dda095c57fd188586b25753842600 +size 56528 diff --git a/.lancedb/nltk_chunking.lance/data/185bffc0-32bf-479c-8829-cc59f21c82e5.lance b/.lancedb/nltk_chunking.lance/data/185bffc0-32bf-479c-8829-cc59f21c82e5.lance new file mode 100644 index 0000000000000000000000000000000000000000..e342489c88b43088c8c0ee7597fbd724974a559d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/185bffc0-32bf-479c-8829-cc59f21c82e5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11d3fb8e005a2e167111c80af2983213f34d30dc6d1be3f8ad678cfd356da102 +size 54580 diff --git a/.lancedb/nltk_chunking.lance/data/197d5843-57f3-4a70-b27f-fe6583816fb8.lance b/.lancedb/nltk_chunking.lance/data/197d5843-57f3-4a70-b27f-fe6583816fb8.lance new file mode 100644 index 0000000000000000000000000000000000000000..3fe63656eafed91f250e6742c0bc48b476b16237 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/197d5843-57f3-4a70-b27f-fe6583816fb8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a2fbbe2483795723d8d854ac4f0be4550d3f385c8213ee7095ce45bf2434de +size 56782 diff --git a/.lancedb/nltk_chunking.lance/data/1b5c156d-07cf-4535-a76b-2539676646c5.lance b/.lancedb/nltk_chunking.lance/data/1b5c156d-07cf-4535-a76b-2539676646c5.lance new file mode 100644 index 0000000000000000000000000000000000000000..b5e77198d359283964b89b8c3e7172e34f4a2f46 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1b5c156d-07cf-4535-a76b-2539676646c5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:232ec4eb2f9e8ce0859c9747a7a863ee9ba0b1e140764cac4779f7770ab366cf +size 58425 diff --git a/.lancedb/nltk_chunking.lance/data/1bbba7cb-0e94-4620-8874-ce7b8ac40ef0.lance b/.lancedb/nltk_chunking.lance/data/1bbba7cb-0e94-4620-8874-ce7b8ac40ef0.lance new file mode 100644 index 0000000000000000000000000000000000000000..3274322a82623f189a0444a354988aecd2adad76 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1bbba7cb-0e94-4620-8874-ce7b8ac40ef0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00805b87d83bd05e931135cf072cbf90f93cc32afe9612d47115e5a79b816cf +size 58113 diff --git a/.lancedb/nltk_chunking.lance/data/1bc384ae-24f5-4bf2-bae3-792e36bd05a6.lance b/.lancedb/nltk_chunking.lance/data/1bc384ae-24f5-4bf2-bae3-792e36bd05a6.lance new file mode 100644 index 0000000000000000000000000000000000000000..d61bddecc2569a1e7c6c27a585fe7fe425bb782c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1bc384ae-24f5-4bf2-bae3-792e36bd05a6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b597b87939f4cb090f2f122c712dad5eebb7c06a7a6adb09999b22284d2917a8 +size 54024 diff --git a/.lancedb/nltk_chunking.lance/data/1d628e54-366d-4883-86fb-037e4209442b.lance b/.lancedb/nltk_chunking.lance/data/1d628e54-366d-4883-86fb-037e4209442b.lance new file mode 100644 index 0000000000000000000000000000000000000000..e0943e459c029e3ea995013b93296f821d0ada5d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1d628e54-366d-4883-86fb-037e4209442b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d9ef1b2ca4dcb85425a736eff9646bbee357e8a41c596948dae43c961642675 +size 54261 diff --git a/.lancedb/nltk_chunking.lance/data/1d984795-76cb-473e-8ee7-fefeeea92702.lance b/.lancedb/nltk_chunking.lance/data/1d984795-76cb-473e-8ee7-fefeeea92702.lance new file mode 100644 index 0000000000000000000000000000000000000000..2c6dbcd8bc650c845511083d8302227b9e2b592b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1d984795-76cb-473e-8ee7-fefeeea92702.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1859a5e9c516f93aa1c472ad7a5053e886a517298553ee60ec6fdd9fd2e4387d +size 54990 diff --git a/.lancedb/nltk_chunking.lance/data/1e01c39c-7023-482a-821d-0e950e4cb1d2.lance b/.lancedb/nltk_chunking.lance/data/1e01c39c-7023-482a-821d-0e950e4cb1d2.lance new file mode 100644 index 0000000000000000000000000000000000000000..22cbe2b57a5098023adef1fe08f870d4aaa503f5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1e01c39c-7023-482a-821d-0e950e4cb1d2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c667b647d0c18e400e334a9194b8b3186bf6edfc740d6a9eafb7a3bac666a3f +size 54119 diff --git a/.lancedb/nltk_chunking.lance/data/1e79de0e-e45a-4313-b88e-416be092282a.lance b/.lancedb/nltk_chunking.lance/data/1e79de0e-e45a-4313-b88e-416be092282a.lance new file mode 100644 index 0000000000000000000000000000000000000000..387a36b3e37093ac8d75e4187492e75276f800d4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1e79de0e-e45a-4313-b88e-416be092282a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ba593d1d2916a95a9b78bec809277b21919a8191e4be61ab8c8dd7ce0ee9bba +size 56689 diff --git a/.lancedb/nltk_chunking.lance/data/1f39b1ac-0107-4242-a50c-3017fa03f477.lance b/.lancedb/nltk_chunking.lance/data/1f39b1ac-0107-4242-a50c-3017fa03f477.lance new file mode 100644 index 0000000000000000000000000000000000000000..0f98860b2d927370de458d028cc7184c607f15ee --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/1f39b1ac-0107-4242-a50c-3017fa03f477.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9885376a573a890b8808644a5faeb1220b3d181c6b1c3eb5906b4661f5e4b398 +size 45014 diff --git a/.lancedb/nltk_chunking.lance/data/1f3b12be-b517-49e1-aa43-9c8114e4a4ad.lance b/.lancedb/nltk_chunking.lance/data/1f3b12be-b517-49e1-aa43-9c8114e4a4ad.lance new file mode 100644 index 0000000000000000000000000000000000000000..63ff59ed49274208ade0615d2146e5b27e45a430 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/1f3b12be-b517-49e1-aa43-9c8114e4a4ad.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22d8fd248556a88db9fffe191476ba416c9d093068a9c4f00ec5fdd6402454a4 +size 57698 diff --git a/.lancedb/nltk_chunking.lance/data/20213419-da77-4598-a8de-661f05397032.lance b/.lancedb/nltk_chunking.lance/data/20213419-da77-4598-a8de-661f05397032.lance new file mode 100644 index 0000000000000000000000000000000000000000..56231fd462f02e40085f9a726815021664979847 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/20213419-da77-4598-a8de-661f05397032.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddca841c4eae65d7610d6181e14c825dd57247688a51ae924d0eb9f4954c7d43 +size 54020 diff --git a/.lancedb/nltk_chunking.lance/data/204e24de-1561-40a2-8943-fa3fc80cef2b.lance b/.lancedb/nltk_chunking.lance/data/204e24de-1561-40a2-8943-fa3fc80cef2b.lance new file mode 100644 index 0000000000000000000000000000000000000000..9453afb9cc9f529d3572878fd32820e4cc08f7ec --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/204e24de-1561-40a2-8943-fa3fc80cef2b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:517d9cf66d01412a0158120891b553faacd7a6c19c9d341e0b4714e304ca0186 +size 56069 diff --git a/.lancedb/nltk_chunking.lance/data/20a1b031-0af1-4653-97b7-67d1af953448.lance b/.lancedb/nltk_chunking.lance/data/20a1b031-0af1-4653-97b7-67d1af953448.lance new file mode 100644 index 0000000000000000000000000000000000000000..42ee752d055ff15f0b61259d9d05e730f8df2502 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/20a1b031-0af1-4653-97b7-67d1af953448.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02c2e057413e50ea1cfaed4d1e8cafa67b461f07a33cba54a440f43a4643e23c +size 56165 diff --git a/.lancedb/nltk_chunking.lance/data/20ab9ea3-a44f-401b-9644-dabae82043bc.lance b/.lancedb/nltk_chunking.lance/data/20ab9ea3-a44f-401b-9644-dabae82043bc.lance new file mode 100644 index 0000000000000000000000000000000000000000..7cc46c36ea1b38932ba7c9313d36983fa7437a0f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/20ab9ea3-a44f-401b-9644-dabae82043bc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff960f99178a8308189f80f72d5fc66e0bf1d58a514ff85e06021745a68eaf86 +size 54654 diff --git a/.lancedb/nltk_chunking.lance/data/20d9cccf-fe87-4638-80d1-69b4f5911e32.lance b/.lancedb/nltk_chunking.lance/data/20d9cccf-fe87-4638-80d1-69b4f5911e32.lance new file mode 100644 index 0000000000000000000000000000000000000000..6abdf6e12677072595a489dda5ec0a9b9966ed62 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/20d9cccf-fe87-4638-80d1-69b4f5911e32.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0f98bada0bc19d1b64d4649c8fafbd2b16a9651e57458a8ce25c7832a1f1bf +size 53521 diff --git a/.lancedb/nltk_chunking.lance/data/215f216c-e9a4-4e19-8f49-be754e982234.lance b/.lancedb/nltk_chunking.lance/data/215f216c-e9a4-4e19-8f49-be754e982234.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..e5144db2438a800f40d9d25bf737241d2882125c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/215f216c-e9a4-4e19-8f49-be754e982234.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5d28534b69ca13d48ea12edc0d9214cf3ba56ccd059e2bcff5bc27d2aef10c9 +size 55954 diff --git a/.lancedb/nltk_chunking.lance/data/2178428d-8d17-48c2-b977-5aae25511e2e.lance b/.lancedb/nltk_chunking.lance/data/2178428d-8d17-48c2-b977-5aae25511e2e.lance new file mode 100644 index 0000000000000000000000000000000000000000..617ef95028b5b6b726bacd9aa98c6a00372f9949 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2178428d-8d17-48c2-b977-5aae25511e2e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c725d7631bfe276683d0be835ff061f9fa4508a2e60e6b1e61d8d2f23fd52d16 +size 54931 diff --git a/.lancedb/nltk_chunking.lance/data/218636ae-8896-4008-b686-a3bfdadf9f02.lance b/.lancedb/nltk_chunking.lance/data/218636ae-8896-4008-b686-a3bfdadf9f02.lance new file mode 100644 index 0000000000000000000000000000000000000000..2830d99a0e5216011db2586c42393a16550e6eca --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/218636ae-8896-4008-b686-a3bfdadf9f02.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ddf46eccc2273ccad0a276a0a324a84e97c0c2dcc102b4ddf93146fb4f4e01d +size 53711 diff --git a/.lancedb/nltk_chunking.lance/data/227e8895-1d37-41d9-8438-d0e52f667f6e.lance b/.lancedb/nltk_chunking.lance/data/227e8895-1d37-41d9-8438-d0e52f667f6e.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e4acf8cfc6942f7a5c669787a5a989aa8a5b85b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/227e8895-1d37-41d9-8438-d0e52f667f6e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e38b09f0cb39b118c20d0348a70c1bb619e41d38f1ce4479bf09b3d42f0c3d8 +size 55387 diff --git a/.lancedb/nltk_chunking.lance/data/22c2b793-66ff-441e-b671-5c92812c35d5.lance b/.lancedb/nltk_chunking.lance/data/22c2b793-66ff-441e-b671-5c92812c35d5.lance new file mode 100644 index 0000000000000000000000000000000000000000..a50afdff4473f2f86230b7e987eb1bee511f431a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/22c2b793-66ff-441e-b671-5c92812c35d5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b583abe4bc02a730f9099aaf9f629fe704eb9bd76e82894a77888370746b380a +size 55912 diff --git a/.lancedb/nltk_chunking.lance/data/231c3ed8-17fd-423b-93c6-f6db30c3b3a4.lance b/.lancedb/nltk_chunking.lance/data/231c3ed8-17fd-423b-93c6-f6db30c3b3a4.lance new file mode 100644 index 0000000000000000000000000000000000000000..50314b59c521c3b09286d23d578bcdc1340e436c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/231c3ed8-17fd-423b-93c6-f6db30c3b3a4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ca39205f71c5f1689a964326e26b4647302c287eb7618bbcfc6a10835fb8a0a +size 53656 diff --git a/.lancedb/nltk_chunking.lance/data/233a64ab-e5d2-41bb-af48-7cd3238865bc.lance b/.lancedb/nltk_chunking.lance/data/233a64ab-e5d2-41bb-af48-7cd3238865bc.lance new file mode 100644 index 0000000000000000000000000000000000000000..54d1543be9529737a71462b5555d8ce9327c658b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/233a64ab-e5d2-41bb-af48-7cd3238865bc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fda72917d23e99f26ec3fcff8959589a37b5e8241bd1b434156701a61578954e +size 53686 diff --git 
a/.lancedb/nltk_chunking.lance/data/235bb6a2-3ed6-4752-9cde-b0ee6b2d1603.lance b/.lancedb/nltk_chunking.lance/data/235bb6a2-3ed6-4752-9cde-b0ee6b2d1603.lance new file mode 100644 index 0000000000000000000000000000000000000000..67a964954982c7e588780a64f10b8ea580870e4b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/235bb6a2-3ed6-4752-9cde-b0ee6b2d1603.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62b9b158932c3377c04966cfa960813e7cdd016b32b752802e4cade51ba4a8dc +size 56044 diff --git a/.lancedb/nltk_chunking.lance/data/23ab7ac4-34c6-4635-9d42-c021e7364c7c.lance b/.lancedb/nltk_chunking.lance/data/23ab7ac4-34c6-4635-9d42-c021e7364c7c.lance new file mode 100644 index 0000000000000000000000000000000000000000..3a78cba530648d25e3f188986eba8f8557d65cf2 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/23ab7ac4-34c6-4635-9d42-c021e7364c7c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:705ee92898bbff7661d1fb8c544f305da9978fb9879582bd0ddbac22e5d51f34 +size 56452 diff --git a/.lancedb/nltk_chunking.lance/data/23bf6a09-29d0-4115-8b85-3c0bfc1bfeaf.lance b/.lancedb/nltk_chunking.lance/data/23bf6a09-29d0-4115-8b85-3c0bfc1bfeaf.lance new file mode 100644 index 0000000000000000000000000000000000000000..473c96c8df3da29a4581029514710b0c4efab37e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/23bf6a09-29d0-4115-8b85-3c0bfc1bfeaf.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f67d38a284e6db55cc6008c65f5d22063e709b0bbb85fd2bbcf1ccb855d492b2 +size 57928 diff --git a/.lancedb/nltk_chunking.lance/data/263d35f7-ee19-4f09-be8f-91911630d95b.lance b/.lancedb/nltk_chunking.lance/data/263d35f7-ee19-4f09-be8f-91911630d95b.lance new file mode 100644 index 0000000000000000000000000000000000000000..5922af4d71561777334b29ad64fa59b0d38bd026 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/263d35f7-ee19-4f09-be8f-91911630d95b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:897668b7996b05c41c45da5cf38e764dbbf8f98ae940acc2f6634d53d47dc1c0 +size 55981 diff --git a/.lancedb/nltk_chunking.lance/data/265bd055-6cd6-4eec-9ca8-2524d6a26f60.lance b/.lancedb/nltk_chunking.lance/data/265bd055-6cd6-4eec-9ca8-2524d6a26f60.lance new file mode 100644 index 0000000000000000000000000000000000000000..b9daa19ac9fe3463507090a9b8c27c5c7d2ff497 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/265bd055-6cd6-4eec-9ca8-2524d6a26f60.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426d11d5703333b61876a6cb08cea1113f31de009a48e57d1072fbcc2b34a030 +size 55474 diff --git a/.lancedb/nltk_chunking.lance/data/26acd698-b85c-49b4-9371-7f880c2709de.lance b/.lancedb/nltk_chunking.lance/data/26acd698-b85c-49b4-9371-7f880c2709de.lance new file mode 100644 index 0000000000000000000000000000000000000000..3723b8f1a80a040782a5f30f2b714b9c19fcbab1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/26acd698-b85c-49b4-9371-7f880c2709de.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:946647f1848cb278aa55fc01ca949f2c563094929b9deac0ae1ca189209f7887 +size 56084 diff --git a/.lancedb/nltk_chunking.lance/data/276a2803-aace-48c5-810e-8c49060b1691.lance b/.lancedb/nltk_chunking.lance/data/276a2803-aace-48c5-810e-8c49060b1691.lance new file mode 100644 index 0000000000000000000000000000000000000000..8254eb476eec4f8b50adc69682a030ef58b16d18 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/276a2803-aace-48c5-810e-8c49060b1691.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fe402e3132791af17d467fd58b0c132d30c2ef61eb96edad270ce3c19edd84a4 +size 54813 diff --git a/.lancedb/nltk_chunking.lance/data/27e66a89-98f7-4fff-840f-9536d09eb11a.lance b/.lancedb/nltk_chunking.lance/data/27e66a89-98f7-4fff-840f-9536d09eb11a.lance new file mode 100644 index 0000000000000000000000000000000000000000..4b6ae731ef0c5ae16ca26dbe59de3d9d5366856d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/27e66a89-98f7-4fff-840f-9536d09eb11a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed633b2a9eefd8b410c60a247d6f85dab09f50f984065e9588149a0b7fdf855 +size 55600 diff --git a/.lancedb/nltk_chunking.lance/data/27f3eba3-307a-441c-b552-1f8bef546c40.lance b/.lancedb/nltk_chunking.lance/data/27f3eba3-307a-441c-b552-1f8bef546c40.lance new file mode 100644 index 0000000000000000000000000000000000000000..31cd50a0aa68a5e9955bb0c3ace42d411da58b53 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/27f3eba3-307a-441c-b552-1f8bef546c40.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7398bca420ad195c736aa0fad6dad47cb934a837bdf93f848d2acaffa42fcd5a +size 54586 diff --git a/.lancedb/nltk_chunking.lance/data/292f9768-9b0b-45e6-8a98-18726721fc2b.lance b/.lancedb/nltk_chunking.lance/data/292f9768-9b0b-45e6-8a98-18726721fc2b.lance new file mode 100644 index 0000000000000000000000000000000000000000..a5afc2c441a7fee2fb87b5d2a78075fff7d9baa6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/292f9768-9b0b-45e6-8a98-18726721fc2b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ae685e296d6b5019a0717a79e759c0e4073dd139a0afbb6aaa9a7444c0e599b +size 55479 diff --git a/.lancedb/nltk_chunking.lance/data/293e94ba-8279-4e6c-a138-7155830fe6f9.lance b/.lancedb/nltk_chunking.lance/data/293e94ba-8279-4e6c-a138-7155830fe6f9.lance new file mode 100644 index 0000000000000000000000000000000000000000..6f7f3fc2cfc10b3f2b621cf9dfea6630a9279b57 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/293e94ba-8279-4e6c-a138-7155830fe6f9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34113f4fb618a28a8974ff93d750ad88470741e759e3142a8bbb5b586807523c +size 54894 diff --git a/.lancedb/nltk_chunking.lance/data/295f0827-76fc-4cad-bf87-763f8a142dfa.lance b/.lancedb/nltk_chunking.lance/data/295f0827-76fc-4cad-bf87-763f8a142dfa.lance new file mode 100644 index 0000000000000000000000000000000000000000..047bb88351868f90e026e7e2af38e42d3ef6d5e5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/295f0827-76fc-4cad-bf87-763f8a142dfa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b0adbc169009363682890ac0bd8bff8157fafd8697421d0294cdc2f5d21288 +size 57682 diff --git a/.lancedb/nltk_chunking.lance/data/2b9a08c2-1d6a-4cbd-922d-2f0768efe80a.lance b/.lancedb/nltk_chunking.lance/data/2b9a08c2-1d6a-4cbd-922d-2f0768efe80a.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a435b18495e4e7b8346b4d657dc9e10f2112f57 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2b9a08c2-1d6a-4cbd-922d-2f0768efe80a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5322ad3b113fe70336f0e03bfa474f184f48a0ceab51023731208747c3e67bce +size 55155 diff --git a/.lancedb/nltk_chunking.lance/data/2bbc1260-0e66-483c-8162-451569232973.lance b/.lancedb/nltk_chunking.lance/data/2bbc1260-0e66-483c-8162-451569232973.lance new file mode 100644 index 0000000000000000000000000000000000000000..144dfc56f5915485be2ddca2948b5f1bef615e30 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2bbc1260-0e66-483c-8162-451569232973.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:460b980eb34fe514b1bca298988a2fd9a73d9e9301006996d8e661fb4de54501 +size 54841 diff --git a/.lancedb/nltk_chunking.lance/data/2e354148-6016-42e5-9359-b78853486f38.lance b/.lancedb/nltk_chunking.lance/data/2e354148-6016-42e5-9359-b78853486f38.lance new file mode 100644 index 0000000000000000000000000000000000000000..ea7207f18b2d24e7e87c9da72080d7095081b827 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2e354148-6016-42e5-9359-b78853486f38.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:378b8ce95eec174c9e53de9365912dcb95990a7227a2914ec0b5de09cbe84237 +size 59432 diff --git a/.lancedb/nltk_chunking.lance/data/2e848111-92b9-4516-a6d2-2427bb152498.lance b/.lancedb/nltk_chunking.lance/data/2e848111-92b9-4516-a6d2-2427bb152498.lance new file mode 100644 index 0000000000000000000000000000000000000000..b7cf380fbfa8624213f31381563ac553e7fdce4f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2e848111-92b9-4516-a6d2-2427bb152498.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a95933428880eb6f482ca167802db8700687afe9941ff3cbd8bad2352e3a4db3 +size 54859 diff --git a/.lancedb/nltk_chunking.lance/data/2ed31d4c-d137-4380-9021-d0f4616a67fc.lance b/.lancedb/nltk_chunking.lance/data/2ed31d4c-d137-4380-9021-d0f4616a67fc.lance new file mode 100644 index 0000000000000000000000000000000000000000..981b5915b7b38f62aac445adaabeb0ebc029b035 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2ed31d4c-d137-4380-9021-d0f4616a67fc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fae815e883fbf0bae9c5764994e1f8f0b831f1d6be1c679961da1002a61529f +size 55732 diff --git a/.lancedb/nltk_chunking.lance/data/2fb552df-01ba-480c-9a64-9b61614c1d13.lance b/.lancedb/nltk_chunking.lance/data/2fb552df-01ba-480c-9a64-9b61614c1d13.lance new file mode 100644 index 0000000000000000000000000000000000000000..4bc3d73354f4866f073d548e5d6b30c0c9f71070 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2fb552df-01ba-480c-9a64-9b61614c1d13.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfa18b9097f98cea90aa17f89a0745113e9fbbe75d524e9c4bf1f1a1533fe162 +size 56013 diff --git a/.lancedb/nltk_chunking.lance/data/2fcf9970-6c05-498a-b78a-5abe6c4187e5.lance b/.lancedb/nltk_chunking.lance/data/2fcf9970-6c05-498a-b78a-5abe6c4187e5.lance new file mode 100644 index 0000000000000000000000000000000000000000..de628fc50a55a4454733ccc86ec69c08b5d601a3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2fcf9970-6c05-498a-b78a-5abe6c4187e5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d06ff5cb624399a35cf868a8c21c114506969518d9db1af582e01358fb2ddb +size 53374 diff --git a/.lancedb/nltk_chunking.lance/data/2ff65be8-d55b-454e-9b7e-b90c2aa673be.lance b/.lancedb/nltk_chunking.lance/data/2ff65be8-d55b-454e-9b7e-b90c2aa673be.lance new file mode 100644 index 0000000000000000000000000000000000000000..9cba5e4a3c038071dd8bdf252f2ced5803192661 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/2ff65be8-d55b-454e-9b7e-b90c2aa673be.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ac7f23e8a3c8ec1791d6c93493ee36d40886592f2a9c08f2a598288299f6802 +size 55375 diff --git a/.lancedb/nltk_chunking.lance/data/3067defa-01c6-482b-8d6a-773c7b4225df.lance 
b/.lancedb/nltk_chunking.lance/data/3067defa-01c6-482b-8d6a-773c7b4225df.lance new file mode 100644 index 0000000000000000000000000000000000000000..0ef0dac32b8538b125371f4c3e77c2d7fd822fec --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3067defa-01c6-482b-8d6a-773c7b4225df.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7de0564398eae9737447d0d9e614658e5cb0bfe3cb9434550ead4b34c65a21bf +size 53357 diff --git a/.lancedb/nltk_chunking.lance/data/3175054c-a59d-457c-815c-9dfb7db53949.lance b/.lancedb/nltk_chunking.lance/data/3175054c-a59d-457c-815c-9dfb7db53949.lance new file mode 100644 index 0000000000000000000000000000000000000000..2d5cc4d6fc226d2dd78d554682e6d918ff24c5a4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3175054c-a59d-457c-815c-9dfb7db53949.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d634f8daa41f638d33a8c5f56b3e0e25f61f6f2ab45922546b6dcfb5276c981e +size 55151 diff --git a/.lancedb/nltk_chunking.lance/data/320ac48f-b2b6-4472-bd68-e095c0221680.lance b/.lancedb/nltk_chunking.lance/data/320ac48f-b2b6-4472-bd68-e095c0221680.lance new file mode 100644 index 0000000000000000000000000000000000000000..a161f8b28bf64da31ff82b9f1e27d4d9dc7b32d1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/320ac48f-b2b6-4472-bd68-e095c0221680.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3630716ec5870e23efa8fc549789f3a21102d5c4e739ea21b88b9c7230a6905 +size 56840 diff --git a/.lancedb/nltk_chunking.lance/data/321c0da2-8d2f-43d6-8d73-d3257d2c8e34.lance b/.lancedb/nltk_chunking.lance/data/321c0da2-8d2f-43d6-8d73-d3257d2c8e34.lance new file mode 100644 index 0000000000000000000000000000000000000000..4175d18a622b3eb1143d4130f238b340c8c90b72 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/321c0da2-8d2f-43d6-8d73-d3257d2c8e34.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a261f9eca5f302274a7ca473c017d467a272100c3fc10b3f85edc41c552f0757 +size 55412 diff --git a/.lancedb/nltk_chunking.lance/data/3244579b-167c-4a75-b165-3d2b32892d32.lance b/.lancedb/nltk_chunking.lance/data/3244579b-167c-4a75-b165-3d2b32892d32.lance new file mode 100644 index 0000000000000000000000000000000000000000..24abbef6521412290411a17d8ea88789c3939570 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3244579b-167c-4a75-b165-3d2b32892d32.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75a065d922248db040d3d2252bc4fbf1b5629ad53c7b9cf134da5a17a1425356 +size 54664 diff --git a/.lancedb/nltk_chunking.lance/data/3277029e-2012-4710-89d8-b01d9adca93e.lance b/.lancedb/nltk_chunking.lance/data/3277029e-2012-4710-89d8-b01d9adca93e.lance new file mode 100644 index 0000000000000000000000000000000000000000..52e48373ff8b41c8f1e089da8d31e9f4d1c8613b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3277029e-2012-4710-89d8-b01d9adca93e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deb7699df1016ff74a87763452b51523176a34b531adb066305df9e34ac81a7a +size 54500 diff --git a/.lancedb/nltk_chunking.lance/data/32b93cfc-f98f-4be0-9505-475ad30bdd90.lance b/.lancedb/nltk_chunking.lance/data/32b93cfc-f98f-4be0-9505-475ad30bdd90.lance new file mode 100644 index 0000000000000000000000000000000000000000..5a88203c6c29fd47e219b74352bb4702e461e932 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/32b93cfc-f98f-4be0-9505-475ad30bdd90.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ac4ab46f511db9cfa61b5ea23d85dc53c707370eb1ee543136df93eeda7cf5fc +size 54261 diff --git a/.lancedb/nltk_chunking.lance/data/337018cc-ac4a-4f10-82a9-8cd94c2c8855.lance b/.lancedb/nltk_chunking.lance/data/337018cc-ac4a-4f10-82a9-8cd94c2c8855.lance new file mode 100644 index 0000000000000000000000000000000000000000..3d8501101ec47d9ed1f2b02812a09f691ffb68da --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/337018cc-ac4a-4f10-82a9-8cd94c2c8855.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f187ab7a1a0aa7363fd44c75be91b98225df59bd29004c63bd06d81c7cfd8f +size 54582 diff --git a/.lancedb/nltk_chunking.lance/data/33cffb49-bd84-4671-84b5-aff6f7ebff20.lance b/.lancedb/nltk_chunking.lance/data/33cffb49-bd84-4671-84b5-aff6f7ebff20.lance new file mode 100644 index 0000000000000000000000000000000000000000..3dd2789b88d12c8e94a503fa67d8289f84ed8680 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/33cffb49-bd84-4671-84b5-aff6f7ebff20.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9369b45d7273be1f94f7cefd622a62a721be31940129354712ef7bf09ec7428e +size 54029 diff --git a/.lancedb/nltk_chunking.lance/data/33e05b61-7b22-469f-bbb3-0b407ab1604a.lance b/.lancedb/nltk_chunking.lance/data/33e05b61-7b22-469f-bbb3-0b407ab1604a.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc3990fbd36b00a591606af04aa8ef51fbd8d849 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/33e05b61-7b22-469f-bbb3-0b407ab1604a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a56480fe6f0671312d075bbf746f1ec58c80fc1cc0aec053a838804805edc65c +size 55456 diff --git a/.lancedb/nltk_chunking.lance/data/34954ed2-1cc0-401c-8d76-62fa10b3271e.lance b/.lancedb/nltk_chunking.lance/data/34954ed2-1cc0-401c-8d76-62fa10b3271e.lance new file mode 100644 index 0000000000000000000000000000000000000000..f3a6c77d0fc73f5f1346857d150a710e629b6a8e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/34954ed2-1cc0-401c-8d76-62fa10b3271e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6a65ffa0dace1970d54580804a85f73fe870f1b83e1f0d240c1a61cdc9c008d +size 54400 diff --git a/.lancedb/nltk_chunking.lance/data/34d9615f-c059-4efc-a51b-da1f8105635f.lance b/.lancedb/nltk_chunking.lance/data/34d9615f-c059-4efc-a51b-da1f8105635f.lance new file mode 100644 index 0000000000000000000000000000000000000000..4b5bb373eb8126da8342b0a75ca12d9136e90aeb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/34d9615f-c059-4efc-a51b-da1f8105635f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aa4b0a2e6d34b091d64612ab1eb7c1ab2e0e5a2aeb5348d336cfd1a3ce3d930 +size 57491 diff --git a/.lancedb/nltk_chunking.lance/data/34e45162-918a-4bd7-ab9b-4b82c8f054bb.lance b/.lancedb/nltk_chunking.lance/data/34e45162-918a-4bd7-ab9b-4b82c8f054bb.lance new file mode 100644 index 0000000000000000000000000000000000000000..0ace10020d79be5046fa93161dc86916ec370b46 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/34e45162-918a-4bd7-ab9b-4b82c8f054bb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef8eadb796e8018056a1a8541d3f0d53893afff3742d93b7ce583bbb6352cc15 +size 56332 diff --git a/.lancedb/nltk_chunking.lance/data/34e460ab-c515-4e35-abd0-2f89a29544ca.lance b/.lancedb/nltk_chunking.lance/data/34e460ab-c515-4e35-abd0-2f89a29544ca.lance new file mode 100644 index 0000000000000000000000000000000000000000..5f2e027bad64a22fc64614c581e66f77f75bb54c --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/34e460ab-c515-4e35-abd0-2f89a29544ca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:814235635516b88c7adb1252c734ab4ec98c65a5406e7659c3da1f22386525b2 +size 57226 diff --git a/.lancedb/nltk_chunking.lance/data/351bed14-1701-4852-9b89-76e091f00259.lance b/.lancedb/nltk_chunking.lance/data/351bed14-1701-4852-9b89-76e091f00259.lance new file mode 100644 index 0000000000000000000000000000000000000000..ff69dc9aa25575057adc082d124fad815101deff --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/351bed14-1701-4852-9b89-76e091f00259.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2791bfc18d400ce8eff42afba4f145f9488b2487452240ab2909723e6687bb30 +size 55378 diff --git a/.lancedb/nltk_chunking.lance/data/372db7c8-cefe-4863-8e72-bd82afe88e84.lance b/.lancedb/nltk_chunking.lance/data/372db7c8-cefe-4863-8e72-bd82afe88e84.lance new file mode 100644 index 0000000000000000000000000000000000000000..890ac17c51efd359ca49916258a1bd18de4b6ab3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/372db7c8-cefe-4863-8e72-bd82afe88e84.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4630260a2572c882057d8888a072787e0181a98d056e8bf9b1fb73080531dff6 +size 53455 diff --git a/.lancedb/nltk_chunking.lance/data/37a6a834-0e48-499d-a22e-fb518b88f02b.lance b/.lancedb/nltk_chunking.lance/data/37a6a834-0e48-499d-a22e-fb518b88f02b.lance new file mode 100644 index 0000000000000000000000000000000000000000..51c3f5d4cd2dae95a44e37a1672b79de857969c0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/37a6a834-0e48-499d-a22e-fb518b88f02b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57974a00a68e870eb3a5040ddde6c05b09cf277744d75d2f95bb9e91e41e891c +size 57031 diff --git a/.lancedb/nltk_chunking.lance/data/38a52f56-b102-4d37-b738-8e87f1d26a4a.lance b/.lancedb/nltk_chunking.lance/data/38a52f56-b102-4d37-b738-8e87f1d26a4a.lance new file mode 100644 index 0000000000000000000000000000000000000000..31d1f90d2c07593a56cb2cb0d66ab9e342ee676f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/38a52f56-b102-4d37-b738-8e87f1d26a4a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b4d3a9d86d7e57ded97dbfeaa08dbf6c2cc158f9b667e098a3cd8ee1d48b26 +size 53411 diff --git a/.lancedb/nltk_chunking.lance/data/3952d4f8-3b6d-4a48-a450-7c03a7e8e949.lance b/.lancedb/nltk_chunking.lance/data/3952d4f8-3b6d-4a48-a450-7c03a7e8e949.lance new file mode 100644 index 0000000000000000000000000000000000000000..4d184c8d8d5e454deb5101dd7606ee69d173edfc --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3952d4f8-3b6d-4a48-a450-7c03a7e8e949.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:912fd5d203530373166c3971a1174bf4b323839730721eb429f4d0355f2639e6 +size 54101 diff --git a/.lancedb/nltk_chunking.lance/data/39d77b78-832b-41b0-807f-d00b8dd114fc.lance b/.lancedb/nltk_chunking.lance/data/39d77b78-832b-41b0-807f-d00b8dd114fc.lance new file mode 100644 index 0000000000000000000000000000000000000000..00a19ab174a590dd5defaebc182ed5e2b294a1e1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/39d77b78-832b-41b0-807f-d00b8dd114fc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cba8aa79e74b4a850cda9395a819c34818d257896d33f8cd563a5dd4e27f2121 +size 53757 diff --git a/.lancedb/nltk_chunking.lance/data/3afed648-e118-44ec-9393-5770be3ecd86.lance b/.lancedb/nltk_chunking.lance/data/3afed648-e118-44ec-9393-5770be3ecd86.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..1a43f0a216aa7396f63d25696a00fe7c9927c56b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3afed648-e118-44ec-9393-5770be3ecd86.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94f217d71dade6dbbf23e32863ce5e53bdf2976c562e75b1fd369c34dfb3cf11 +size 54792 diff --git a/.lancedb/nltk_chunking.lance/data/3b5e0ad7-96ae-4f28-bce1-360ed456188a.lance b/.lancedb/nltk_chunking.lance/data/3b5e0ad7-96ae-4f28-bce1-360ed456188a.lance new file mode 100644 index 0000000000000000000000000000000000000000..414d2c665018bbd96cc0781f22e2c2b480c51124 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3b5e0ad7-96ae-4f28-bce1-360ed456188a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c870b511ecb992b9b6682f008e46240283721e5b343a431ce8a5258557f5942 +size 55879 diff --git a/.lancedb/nltk_chunking.lance/data/3bd96657-ff56-4754-aa36-dae35ca48dd8.lance b/.lancedb/nltk_chunking.lance/data/3bd96657-ff56-4754-aa36-dae35ca48dd8.lance new file mode 100644 index 0000000000000000000000000000000000000000..da0561eb82ade4cae985db354692d826610473cc --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3bd96657-ff56-4754-aa36-dae35ca48dd8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c5933497f40d98b57c5800fef64d220901b2c62cffb791f7678448440693d7a +size 57124 diff --git a/.lancedb/nltk_chunking.lance/data/3c2128d5-4169-4fa3-bdd9-e746c742b165.lance b/.lancedb/nltk_chunking.lance/data/3c2128d5-4169-4fa3-bdd9-e746c742b165.lance new file mode 100644 index 0000000000000000000000000000000000000000..59950ef26369084727614adac0ad1d5998cffe29 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3c2128d5-4169-4fa3-bdd9-e746c742b165.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1c3678efc66f041dc55c2f4b2477dc9da7e905bc6aba6e7999399833adbe45 +size 55489 diff --git a/.lancedb/nltk_chunking.lance/data/3d0bf263-cc92-4df1-aeda-4c42d7fffef9.lance b/.lancedb/nltk_chunking.lance/data/3d0bf263-cc92-4df1-aeda-4c42d7fffef9.lance new file mode 100644 index 0000000000000000000000000000000000000000..625185b7a6cc4e7b40e62a8a74d3d72ee2536172 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3d0bf263-cc92-4df1-aeda-4c42d7fffef9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e007f6a3e3145f604e82bdd384bbfe7aab4e7c72c66fec2f4a1776116b502feb +size 57841 diff --git a/.lancedb/nltk_chunking.lance/data/3ed9db16-93f7-4a6b-afb8-1a14b59e19c3.lance b/.lancedb/nltk_chunking.lance/data/3ed9db16-93f7-4a6b-afb8-1a14b59e19c3.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a9219e8bdf80df288b2397950da975d279d19ce --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3ed9db16-93f7-4a6b-afb8-1a14b59e19c3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7012f5eb4d95db24a7c0d73175bc01bfe022f1fd583c5c242bc20f2a1efa3e3e +size 58724 diff --git a/.lancedb/nltk_chunking.lance/data/3f39ee94-6e0f-41f3-8bc1-2e27ca7fcb93.lance b/.lancedb/nltk_chunking.lance/data/3f39ee94-6e0f-41f3-8bc1-2e27ca7fcb93.lance new file mode 100644 index 0000000000000000000000000000000000000000..cdc9d95effe2e65ca52c1b1d32acf9faaf9b5b89 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/3f39ee94-6e0f-41f3-8bc1-2e27ca7fcb93.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acf59f647f356227912023edf4fa0945d49f37a53f2d250ea9ea7a6bf18a9f1e +size 58905 diff --git 
a/.lancedb/nltk_chunking.lance/data/402b040c-f60f-457d-a958-6106838ccc80.lance b/.lancedb/nltk_chunking.lance/data/402b040c-f60f-457d-a958-6106838ccc80.lance new file mode 100644 index 0000000000000000000000000000000000000000..78615719c0b0b75b68c621376a11659a5d2ea245 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/402b040c-f60f-457d-a958-6106838ccc80.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfc49d3263f66373e6c99d2816db853d2db60c05773fbaec15fb01f5b34484b +size 55656 diff --git a/.lancedb/nltk_chunking.lance/data/410d04d6-4e07-4c78-975c-b0c9c506f115.lance b/.lancedb/nltk_chunking.lance/data/410d04d6-4e07-4c78-975c-b0c9c506f115.lance new file mode 100644 index 0000000000000000000000000000000000000000..e40e87359aee703a5925952473754a81414a0654 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/410d04d6-4e07-4c78-975c-b0c9c506f115.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e47264b3b57ea1c397f66f8612e5d0047420e30fa366c5ada17bf6e5e20d94d7 +size 58358 diff --git a/.lancedb/nltk_chunking.lance/data/414e9129-a817-4ff8-badc-754894e56c88.lance b/.lancedb/nltk_chunking.lance/data/414e9129-a817-4ff8-badc-754894e56c88.lance new file mode 100644 index 0000000000000000000000000000000000000000..88f379e0c3211687398cb035e661bd1f36b7f6fd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/414e9129-a817-4ff8-badc-754894e56c88.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f82ac8428a95c8326fbd9bfb193d0680fc55d35bbff142f0838174a21864737b +size 53428 diff --git a/.lancedb/nltk_chunking.lance/data/423ab76e-4e7b-4ca4-a4e6-5d03b16c8519.lance b/.lancedb/nltk_chunking.lance/data/423ab76e-4e7b-4ca4-a4e6-5d03b16c8519.lance new file mode 100644 index 0000000000000000000000000000000000000000..ead5c925457c9310cdaa309dc055263423e05327 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/423ab76e-4e7b-4ca4-a4e6-5d03b16c8519.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f29bdb0e2c26c97ba9361e22ed57d5a2aac52ea04df9886de490e965d71805cd +size 53989 diff --git a/.lancedb/nltk_chunking.lance/data/428ad4e9-1f0b-424b-9956-8ca2a2de5df3.lance b/.lancedb/nltk_chunking.lance/data/428ad4e9-1f0b-424b-9956-8ca2a2de5df3.lance new file mode 100644 index 0000000000000000000000000000000000000000..5e0bb6ab643ce823c3664b0b48c420f3c1b29a1a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/428ad4e9-1f0b-424b-9956-8ca2a2de5df3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae63e6647a7acffce178da8d4c04e7e4a6a2d309ae6747fa90f73b7c75a489d1 +size 54862 diff --git a/.lancedb/nltk_chunking.lance/data/428ffe10-a3f6-446d-80ae-ccdebd053725.lance b/.lancedb/nltk_chunking.lance/data/428ffe10-a3f6-446d-80ae-ccdebd053725.lance new file mode 100644 index 0000000000000000000000000000000000000000..b9b0c2a37b4ac97ac0ef9a0604b352908474cc96 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/428ffe10-a3f6-446d-80ae-ccdebd053725.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f2ce1e8a148f22aefa2aef6de4b9acd43c92a2ef1d051415061708571194b99 +size 55658 diff --git a/.lancedb/nltk_chunking.lance/data/42ed7c52-dccf-4030-a9a0-1a0814ceff85.lance b/.lancedb/nltk_chunking.lance/data/42ed7c52-dccf-4030-a9a0-1a0814ceff85.lance new file mode 100644 index 0000000000000000000000000000000000000000..68a0f9bbf84ffb022358ae3259ce5045ff583c99 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/42ed7c52-dccf-4030-a9a0-1a0814ceff85.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cf91ee67b98fd4e9fd3f6051420166678c1243ec7d77fdaf3c423e05887b013f +size 54183 diff --git a/.lancedb/nltk_chunking.lance/data/451d510a-9413-4e61-a33c-ba87c759c124.lance b/.lancedb/nltk_chunking.lance/data/451d510a-9413-4e61-a33c-ba87c759c124.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b19770941f3a9331daa3ab8cd202d918cfd997c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/451d510a-9413-4e61-a33c-ba87c759c124.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06663bbed550911ff924900fc58a8f401f75e4cd3a8eb0ff408ec7a7580be09 +size 54060 diff --git a/.lancedb/nltk_chunking.lance/data/465a1419-6147-49af-92de-a46ec7f897a7.lance b/.lancedb/nltk_chunking.lance/data/465a1419-6147-49af-92de-a46ec7f897a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..417efc3cf77c0b28c087e6719700a7d2338a3f1a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/465a1419-6147-49af-92de-a46ec7f897a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b3194eebda595d15f6a0ceede16fcf044312b488d681d798a0d26d7df09cc7d +size 56547 diff --git a/.lancedb/nltk_chunking.lance/data/467f95ed-8f66-43c8-a700-df370c73778b.lance b/.lancedb/nltk_chunking.lance/data/467f95ed-8f66-43c8-a700-df370c73778b.lance new file mode 100644 index 0000000000000000000000000000000000000000..28d705c937a53e37dd0a274aa8c9312468ad7cd5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/467f95ed-8f66-43c8-a700-df370c73778b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb8fe344dde0a19b7861ddef1b4d2a5d86c159d19f66dbf4649652e33c481c8f +size 56974 diff --git a/.lancedb/nltk_chunking.lance/data/468efe39-992e-42e4-b5ff-fec7c4ccec03.lance b/.lancedb/nltk_chunking.lance/data/468efe39-992e-42e4-b5ff-fec7c4ccec03.lance new file mode 100644 index 0000000000000000000000000000000000000000..56375c643153f3933e23adb39a4039ec80880cc7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/468efe39-992e-42e4-b5ff-fec7c4ccec03.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65f4357d57c16deb30eee03461b38c948d0568b89e20087e1d992233811116fd +size 58717 diff --git a/.lancedb/nltk_chunking.lance/data/47309446-7bd7-4ddc-af07-d987e9a7f21b.lance b/.lancedb/nltk_chunking.lance/data/47309446-7bd7-4ddc-af07-d987e9a7f21b.lance new file mode 100644 index 0000000000000000000000000000000000000000..8a005b45f1809d90c1db6c1892a25fb65fb8720a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/47309446-7bd7-4ddc-af07-d987e9a7f21b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b16cc087dc5900fc9c751e8b7480b76d835eb426a41e5a56e9f1b1f54e5c32a +size 55840 diff --git a/.lancedb/nltk_chunking.lance/data/487cd590-ec2d-43ef-860a-5b45b8f976fa.lance b/.lancedb/nltk_chunking.lance/data/487cd590-ec2d-43ef-860a-5b45b8f976fa.lance new file mode 100644 index 0000000000000000000000000000000000000000..7901e5abe4bb21a71044bb025bf9d7ab498f5c55 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/487cd590-ec2d-43ef-860a-5b45b8f976fa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:957a375ee5b49c7cf7286b1c7055297d0566615c91c13df0b04280aab24b20db +size 57027 diff --git a/.lancedb/nltk_chunking.lance/data/48a64a51-2df4-4d6b-8267-1dee5906103d.lance b/.lancedb/nltk_chunking.lance/data/48a64a51-2df4-4d6b-8267-1dee5906103d.lance new file mode 100644 index 0000000000000000000000000000000000000000..b622529d9010f61ac68de69a6729b39f33dff5e4 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/48a64a51-2df4-4d6b-8267-1dee5906103d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6de9e096b1733780e35fa98f43d1ca01bf2f52b3626cd19aa2bf8d4b58d4c9 +size 55219 diff --git a/.lancedb/nltk_chunking.lance/data/4940a642-b695-485f-a395-16e1bd28f41a.lance b/.lancedb/nltk_chunking.lance/data/4940a642-b695-485f-a395-16e1bd28f41a.lance new file mode 100644 index 0000000000000000000000000000000000000000..70da6e759bf96aec0c0d315648ef7e4afca853f0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4940a642-b695-485f-a395-16e1bd28f41a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b70f1afbfd36b84bd9e13f4d0ab6605f127a9d5218fe2439a532a70def5c4ab +size 55351 diff --git a/.lancedb/nltk_chunking.lance/data/49431907-c5da-4ffe-8f20-f8d7556cb1c3.lance b/.lancedb/nltk_chunking.lance/data/49431907-c5da-4ffe-8f20-f8d7556cb1c3.lance new file mode 100644 index 0000000000000000000000000000000000000000..2e3f8cf393943fd4f46b78f3941532554bcc4019 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/49431907-c5da-4ffe-8f20-f8d7556cb1c3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0317ac8a5d62b81230618840e41e5ced2ea1c5f1e163344c2633c0f20ed40f90 +size 55984 diff --git a/.lancedb/nltk_chunking.lance/data/4980ba76-5f34-4068-9cdd-37a395ac0816.lance b/.lancedb/nltk_chunking.lance/data/4980ba76-5f34-4068-9cdd-37a395ac0816.lance new file mode 100644 index 0000000000000000000000000000000000000000..5c53984600c3d67cfa751a3b2326d982f1267c46 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4980ba76-5f34-4068-9cdd-37a395ac0816.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1de33e1b99ea2c08b98738e7b6c4b28db5fbe1f2897f92ac86df701a0cc8b04 +size 57561 diff --git a/.lancedb/nltk_chunking.lance/data/4991f1bf-90d5-466f-96fb-a607f0b7b95f.lance b/.lancedb/nltk_chunking.lance/data/4991f1bf-90d5-466f-96fb-a607f0b7b95f.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b4d2bc8f477746056ac6627e26cb61598124c6d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4991f1bf-90d5-466f-96fb-a607f0b7b95f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b0aa97a9245229263ae97979d2c5b11815b34532d56c59cf23308aefccba15 +size 59259 diff --git a/.lancedb/nltk_chunking.lance/data/49afd28c-4898-4943-8e8d-d64ac32d930d.lance b/.lancedb/nltk_chunking.lance/data/49afd28c-4898-4943-8e8d-d64ac32d930d.lance new file mode 100644 index 0000000000000000000000000000000000000000..c268c7d5992723662d9f5b8d347625d22be646e1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/49afd28c-4898-4943-8e8d-d64ac32d930d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:703216ca7305dc6b5bdadcec17b2a5e03afaedf0a26c960e65cda1b8433a469b +size 53724 diff --git a/.lancedb/nltk_chunking.lance/data/4a9c71ec-6ae0-441b-8fad-df4c36ae9fcf.lance b/.lancedb/nltk_chunking.lance/data/4a9c71ec-6ae0-441b-8fad-df4c36ae9fcf.lance new file mode 100644 index 0000000000000000000000000000000000000000..46ac05e2b76e5783ebad2190070cbb61b149db14 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4a9c71ec-6ae0-441b-8fad-df4c36ae9fcf.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20c167c5f7f49114f3dd830042167719b72e23e7e1ebeacdf13a87cde12d2f4c +size 54390 diff --git a/.lancedb/nltk_chunking.lance/data/4ac26fc9-3120-4620-ae68-1c077d6fbd06.lance 
b/.lancedb/nltk_chunking.lance/data/4ac26fc9-3120-4620-ae68-1c077d6fbd06.lance new file mode 100644 index 0000000000000000000000000000000000000000..61a8cbe60414197ebbeb4bc7f6ff9e82ca35c10a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4ac26fc9-3120-4620-ae68-1c077d6fbd06.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6100229e41e5c9a5638d3317034a89dd32c4f0057922f307d3f6480806425648 +size 55667 diff --git a/.lancedb/nltk_chunking.lance/data/4ac51626-ec9b-4753-bcb1-19cd9ae5d0b9.lance b/.lancedb/nltk_chunking.lance/data/4ac51626-ec9b-4753-bcb1-19cd9ae5d0b9.lance new file mode 100644 index 0000000000000000000000000000000000000000..c6dd55dfdd8882c05eb3610bc123f7c888bc5ae3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4ac51626-ec9b-4753-bcb1-19cd9ae5d0b9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76039b586f86e160fde2f747436c24d1d9e6c180a59e97e321897fc88c1ba17c +size 53768 diff --git a/.lancedb/nltk_chunking.lance/data/4c1432f9-2b1a-45bc-b78b-821dbb674f9f.lance b/.lancedb/nltk_chunking.lance/data/4c1432f9-2b1a-45bc-b78b-821dbb674f9f.lance new file mode 100644 index 0000000000000000000000000000000000000000..2b51f6c415d506f8fd6c38f0f3538552a56c124b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4c1432f9-2b1a-45bc-b78b-821dbb674f9f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a72c001ba2b2ac1e5c2eaf5b6d0a9ea2db1aa48aabe6b44a8a67b03ce92d56b3 +size 54729 diff --git a/.lancedb/nltk_chunking.lance/data/4c809e47-513e-4076-aa37-f10262b99fff.lance b/.lancedb/nltk_chunking.lance/data/4c809e47-513e-4076-aa37-f10262b99fff.lance new file mode 100644 index 0000000000000000000000000000000000000000..cb073ec4f5c2faa077e1d550f224b4c0e9808755 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4c809e47-513e-4076-aa37-f10262b99fff.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:296a7ed6eca52408076e50566b111ebc44e39b96e78513789f8e2ebbb763855e +size 53949 diff --git a/.lancedb/nltk_chunking.lance/data/4cc1f5a7-9c30-4cb7-8005-eccdd62be1b7.lance b/.lancedb/nltk_chunking.lance/data/4cc1f5a7-9c30-4cb7-8005-eccdd62be1b7.lance new file mode 100644 index 0000000000000000000000000000000000000000..2a00b2b17afb6f23f725591d92d4c739f5547391 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4cc1f5a7-9c30-4cb7-8005-eccdd62be1b7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:624183e3ccf3a3d76c724f106f3b1753e8560841395bb2d5ed0d0a11dbee255b +size 54429 diff --git a/.lancedb/nltk_chunking.lance/data/4d5c305c-330e-45af-aa0c-7faccf605ec7.lance b/.lancedb/nltk_chunking.lance/data/4d5c305c-330e-45af-aa0c-7faccf605ec7.lance new file mode 100644 index 0000000000000000000000000000000000000000..ba925bfff63a9044834fb16ceaa3e290b2a1e36d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4d5c305c-330e-45af-aa0c-7faccf605ec7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f736572b9ae32026995fe7c55569b0dc0d0256ea5a7566c823e2c53032def68 +size 56000 diff --git a/.lancedb/nltk_chunking.lance/data/4e3d5132-8bf6-4a5f-b42b-471c1c1aaf6a.lance b/.lancedb/nltk_chunking.lance/data/4e3d5132-8bf6-4a5f-b42b-471c1c1aaf6a.lance new file mode 100644 index 0000000000000000000000000000000000000000..97eb25fad38a1e32d54b892fd38b966197b6c061 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/4e3d5132-8bf6-4a5f-b42b-471c1c1aaf6a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:ff3b52317bb8c1e29ddda08912c4ca01bc07861797bbc4ef65ec15c214af0e24 +size 54969 diff --git a/.lancedb/nltk_chunking.lance/data/5031311a-0d0a-464f-acf6-99221fa3ef18.lance b/.lancedb/nltk_chunking.lance/data/5031311a-0d0a-464f-acf6-99221fa3ef18.lance new file mode 100644 index 0000000000000000000000000000000000000000..3d64029ffddf2c062693d9972a91a045883222ee --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5031311a-0d0a-464f-acf6-99221fa3ef18.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67b057177bffe9558b473dac13363cee52af1ee5509caf7745f679bc20e14711 +size 58014 diff --git a/.lancedb/nltk_chunking.lance/data/5047a361-365e-48f7-8923-77e35ce592e8.lance b/.lancedb/nltk_chunking.lance/data/5047a361-365e-48f7-8923-77e35ce592e8.lance new file mode 100644 index 0000000000000000000000000000000000000000..e95369516e42abe03eab390f801b07db343b772c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5047a361-365e-48f7-8923-77e35ce592e8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42615bc5af33b213e582cc232f6b3f3c995e44fd96aea1039b8346d21038ac84 +size 55919 diff --git a/.lancedb/nltk_chunking.lance/data/51f1cee4-18c3-4ee3-904a-09291a70e1b0.lance b/.lancedb/nltk_chunking.lance/data/51f1cee4-18c3-4ee3-904a-09291a70e1b0.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b7d2939f0ca5ffd11b4afb38c75f47ff1bcc82c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/51f1cee4-18c3-4ee3-904a-09291a70e1b0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3d4ee706f17c127a62fb679091a8e12a06cbbd1aa973f262680fd96f7f81c04 +size 55758 diff --git a/.lancedb/nltk_chunking.lance/data/523eaab4-1b4f-4e06-af30-e1ff8d071f20.lance b/.lancedb/nltk_chunking.lance/data/523eaab4-1b4f-4e06-af30-e1ff8d071f20.lance new file mode 100644 index 0000000000000000000000000000000000000000..f2e23533ee4b183b59ad9564168303746a1faf24 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/523eaab4-1b4f-4e06-af30-e1ff8d071f20.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eeacea9a89d33162e0335298da70ac9b058a4d8e594b785515d64b0a540b5adc +size 57413 diff --git a/.lancedb/nltk_chunking.lance/data/52a358ad-3573-4abb-a640-8ab87c4cefff.lance b/.lancedb/nltk_chunking.lance/data/52a358ad-3573-4abb-a640-8ab87c4cefff.lance new file mode 100644 index 0000000000000000000000000000000000000000..420adbb1ef064da06d6e1bebb8ec91cbc40a6810 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/52a358ad-3573-4abb-a640-8ab87c4cefff.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc9d0c5d9dd6cf1f9a46f38c6d202cde189adecfcb5345cb1e48884c224d9190 +size 57088 diff --git a/.lancedb/nltk_chunking.lance/data/52a4d3eb-07d3-41e9-81ef-9a458c5bbb02.lance b/.lancedb/nltk_chunking.lance/data/52a4d3eb-07d3-41e9-81ef-9a458c5bbb02.lance new file mode 100644 index 0000000000000000000000000000000000000000..9914ef07e41a14d1d4c541a5a0b7d933f2824a69 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/52a4d3eb-07d3-41e9-81ef-9a458c5bbb02.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0de8c58738ba4b1c32add243664fb24b2c0eec42d3a38d386c13649db3b5ca +size 56317 diff --git a/.lancedb/nltk_chunking.lance/data/53d74ad7-c7cd-467b-86f8-a3472a8a683e.lance b/.lancedb/nltk_chunking.lance/data/53d74ad7-c7cd-467b-86f8-a3472a8a683e.lance new file mode 100644 index 0000000000000000000000000000000000000000..27ba0fa27844f65eccf9a9b9b71f3f1505a87641 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/53d74ad7-c7cd-467b-86f8-a3472a8a683e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26b5f8066b9e0dcce4693d5ca85d11d10d4332759573aee895ce2f087d964542 +size 52857 diff --git a/.lancedb/nltk_chunking.lance/data/5423a9b4-0eb7-4e2a-a0c5-7ce8b791923d.lance b/.lancedb/nltk_chunking.lance/data/5423a9b4-0eb7-4e2a-a0c5-7ce8b791923d.lance new file mode 100644 index 0000000000000000000000000000000000000000..f9ec6a2a03ec54c22b2926c8d4f67e0e72d8611d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5423a9b4-0eb7-4e2a-a0c5-7ce8b791923d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691c840b4975b9942c8ea101e3ef02f4ac82e386182a848a0952258206067dc9 +size 55634 diff --git a/.lancedb/nltk_chunking.lance/data/5452bbdb-f113-4118-8fe4-1cdb82f544c2.lance b/.lancedb/nltk_chunking.lance/data/5452bbdb-f113-4118-8fe4-1cdb82f544c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..cd4ba369525ec8370d708041839e5c0164e7c100 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5452bbdb-f113-4118-8fe4-1cdb82f544c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:245b169a2c8b8db8cd5f7f41165d4487b4f3822296c65fb986a7d7cbb14f643e +size 55871 diff --git a/.lancedb/nltk_chunking.lance/data/547e782d-ed59-4842-bbea-d0be6f620a25.lance b/.lancedb/nltk_chunking.lance/data/547e782d-ed59-4842-bbea-d0be6f620a25.lance new file mode 100644 index 0000000000000000000000000000000000000000..783265b0ccdd47d1507c676b347fa8d182cf2cbe --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/547e782d-ed59-4842-bbea-d0be6f620a25.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd8bc7f860f779761334e1358cb08f589168eba41fa24642492d152f4339deae +size 54124 diff --git a/.lancedb/nltk_chunking.lance/data/554a4270-4a7e-46f9-a30a-389021765f57.lance b/.lancedb/nltk_chunking.lance/data/554a4270-4a7e-46f9-a30a-389021765f57.lance new file mode 100644 index 0000000000000000000000000000000000000000..0e056b80b9d0305b54f6e9bdfe02661f148541f0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/554a4270-4a7e-46f9-a30a-389021765f57.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:436474a8c254a660e1469b233290afd944137c33202fd0aef8b4ab57ab79eb17 +size 56023 diff --git a/.lancedb/nltk_chunking.lance/data/5609bffb-b3ea-492b-9553-63fb452834d5.lance b/.lancedb/nltk_chunking.lance/data/5609bffb-b3ea-492b-9553-63fb452834d5.lance new file mode 100644 index 0000000000000000000000000000000000000000..b72bbe94abb16618d7839c71e4ed84e456bf1900 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5609bffb-b3ea-492b-9553-63fb452834d5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9858ea67ce5024976312f3439856e0b982c9074a9d0135e699aaa765d187e57b +size 61615 diff --git a/.lancedb/nltk_chunking.lance/data/5826b7ae-3e76-40e4-a11a-942ee68aab3a.lance b/.lancedb/nltk_chunking.lance/data/5826b7ae-3e76-40e4-a11a-942ee68aab3a.lance new file mode 100644 index 0000000000000000000000000000000000000000..3891637c84ec3d11ab845595288b5877cc1d8bc6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5826b7ae-3e76-40e4-a11a-942ee68aab3a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f4239297602ee64fef919ee7143b516da70ba89eb1010085226e8d73b24552 +size 56735 diff --git a/.lancedb/nltk_chunking.lance/data/5852270a-f73d-4569-b445-32c4ae974f30.lance b/.lancedb/nltk_chunking.lance/data/5852270a-f73d-4569-b445-32c4ae974f30.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..0663791dc8bdd5b51863b34c3c2c7af964854f47 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5852270a-f73d-4569-b445-32c4ae974f30.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:879a3f2ddb86603a26f97818c1ec5d8176b5fb75e665ac8dba7156773f70e5a5 +size 57037 diff --git a/.lancedb/nltk_chunking.lance/data/5868a1cb-d079-4ef7-8f7f-b2199f229cab.lance b/.lancedb/nltk_chunking.lance/data/5868a1cb-d079-4ef7-8f7f-b2199f229cab.lance new file mode 100644 index 0000000000000000000000000000000000000000..4ae166d6609dbd6ef73201d4c2d92b7ef29adaf9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5868a1cb-d079-4ef7-8f7f-b2199f229cab.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08de875387cd63289c5f6d6e6a7744d36e3531014a36f01535b8576efeca2034 +size 55542 diff --git a/.lancedb/nltk_chunking.lance/data/5a0d55fc-9f40-4c2c-8765-8ea1d39ec0b1.lance b/.lancedb/nltk_chunking.lance/data/5a0d55fc-9f40-4c2c-8765-8ea1d39ec0b1.lance new file mode 100644 index 0000000000000000000000000000000000000000..0c244720ee6071c94b42507e75281332f9524c4b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5a0d55fc-9f40-4c2c-8765-8ea1d39ec0b1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b430661edf1fd8a172a9d0aaf44f1171952cffe616259aa043c980e264d95bf8 +size 58954 diff --git a/.lancedb/nltk_chunking.lance/data/5bd5e614-c444-4204-9f18-fd4fe7dcbcf4.lance b/.lancedb/nltk_chunking.lance/data/5bd5e614-c444-4204-9f18-fd4fe7dcbcf4.lance new file mode 100644 index 0000000000000000000000000000000000000000..0f8eab134dfc587049cda7ede07a6ef735f4b8d1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5bd5e614-c444-4204-9f18-fd4fe7dcbcf4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ebc7538b4fe4b6d7a99d78f46deb830c4907bdcf2c481b9260486dee1427bf0 +size 57311 diff --git a/.lancedb/nltk_chunking.lance/data/5c9c7f25-1501-4c06-97b1-5683127309f7.lance b/.lancedb/nltk_chunking.lance/data/5c9c7f25-1501-4c06-97b1-5683127309f7.lance new file mode 100644 index 0000000000000000000000000000000000000000..23242696d1cade63b0ea903c689b24c91618ed1b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5c9c7f25-1501-4c06-97b1-5683127309f7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f63defa64d08fc6cac2667c9a153770928efdd9b2920c4da86aabf358a0b24e +size 56118 diff --git a/.lancedb/nltk_chunking.lance/data/5d400181-0c9e-42ff-87e8-a1430cbc6f2e.lance b/.lancedb/nltk_chunking.lance/data/5d400181-0c9e-42ff-87e8-a1430cbc6f2e.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b434cdd4b6776450161a611e063c7974525f71a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5d400181-0c9e-42ff-87e8-a1430cbc6f2e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7daec9d4eaf2b6e505bc7c7d6fc8ccef8bbf92d36d0af32729ff443a1eaae15 +size 55594 diff --git a/.lancedb/nltk_chunking.lance/data/5ec28147-482c-491c-8c00-491c9bb55aa2.lance b/.lancedb/nltk_chunking.lance/data/5ec28147-482c-491c-8c00-491c9bb55aa2.lance new file mode 100644 index 0000000000000000000000000000000000000000..8aa4550017ef5ebeb34b1e7ad00f5feac32b6ef3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5ec28147-482c-491c-8c00-491c9bb55aa2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f14baadc2b7495cd61bd4aca347be938cb874cb2ce1d9b8e169d974cbd413675 +size 56196 diff --git 
a/.lancedb/nltk_chunking.lance/data/5f421b62-ce25-4408-a9e8-3d3bc969de1d.lance b/.lancedb/nltk_chunking.lance/data/5f421b62-ce25-4408-a9e8-3d3bc969de1d.lance new file mode 100644 index 0000000000000000000000000000000000000000..3bb55260ddcf44f0b28667e8f5cf0f15be318a56 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5f421b62-ce25-4408-a9e8-3d3bc969de1d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56c189dd46203e8e5a355824a463f88a30b10dd5b014b813bd8f951cc0cbac7b +size 56522 diff --git a/.lancedb/nltk_chunking.lance/data/5ffc8889-6ea0-49d2-8354-8c18f154c8fb.lance b/.lancedb/nltk_chunking.lance/data/5ffc8889-6ea0-49d2-8354-8c18f154c8fb.lance new file mode 100644 index 0000000000000000000000000000000000000000..1ca4d66bce6e63fc1c84bbe5e9f5f3af72869813 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/5ffc8889-6ea0-49d2-8354-8c18f154c8fb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1922b90abb45fd5bbaf3401b26c338907aaeb2f9dfad27e0414eba68a265c30d +size 55665 diff --git a/.lancedb/nltk_chunking.lance/data/60d1b2f2-2dc0-406b-b7f6-50a91a1dbf27.lance b/.lancedb/nltk_chunking.lance/data/60d1b2f2-2dc0-406b-b7f6-50a91a1dbf27.lance new file mode 100644 index 0000000000000000000000000000000000000000..15a390d28d2006b5cd8dd93afa5e2b5b8fed5f31 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/60d1b2f2-2dc0-406b-b7f6-50a91a1dbf27.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a51107f4b81edfedba8ea07287dc75f284b997b49b61cd2558e840d004ae098 +size 54766 diff --git a/.lancedb/nltk_chunking.lance/data/622ef2f6-0b4b-4c3a-9520-bb61b83ad74f.lance b/.lancedb/nltk_chunking.lance/data/622ef2f6-0b4b-4c3a-9520-bb61b83ad74f.lance new file mode 100644 index 0000000000000000000000000000000000000000..88f84cb3d35df48a0e718e0043a8e1ee6b4eaf91 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/622ef2f6-0b4b-4c3a-9520-bb61b83ad74f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5d0710b7a7a13f0cc27c2028b396909911371b4ad9138a89c068abf8b0289c +size 54768 diff --git a/.lancedb/nltk_chunking.lance/data/63a4fa66-1d36-4024-8750-2853ebe83bdb.lance b/.lancedb/nltk_chunking.lance/data/63a4fa66-1d36-4024-8750-2853ebe83bdb.lance new file mode 100644 index 0000000000000000000000000000000000000000..83f7a05b55a2a0789214978f394a0e9018751a10 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/63a4fa66-1d36-4024-8750-2853ebe83bdb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db50e32ffc7bb9ecff7c6239c821d4f23deb23e532653a97cfb09709c80ca91b +size 55829 diff --git a/.lancedb/nltk_chunking.lance/data/63d06d78-b9d5-4c3e-bd71-3ec525580218.lance b/.lancedb/nltk_chunking.lance/data/63d06d78-b9d5-4c3e-bd71-3ec525580218.lance new file mode 100644 index 0000000000000000000000000000000000000000..37d0070c64bf710786d08a5c5ed3171d68de3cc0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/63d06d78-b9d5-4c3e-bd71-3ec525580218.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf26df9819c6b3f17ee0dc6db28f93aef3532e06a34bb264bd52ecb277040b2d +size 56772 diff --git a/.lancedb/nltk_chunking.lance/data/64158f1c-c2e4-487f-a785-6df895c25e76.lance b/.lancedb/nltk_chunking.lance/data/64158f1c-c2e4-487f-a785-6df895c25e76.lance new file mode 100644 index 0000000000000000000000000000000000000000..c76a83dd21044f402465cbfbedb3664b015ba3d7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/64158f1c-c2e4-487f-a785-6df895c25e76.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:15fed92427bdd8cc43cb2cd8b61e5b85c4b88aaa1c3b443f25c6269e419b3c60 +size 55659 diff --git a/.lancedb/nltk_chunking.lance/data/64daac75-69fb-49c6-8282-a57921a64c6a.lance b/.lancedb/nltk_chunking.lance/data/64daac75-69fb-49c6-8282-a57921a64c6a.lance new file mode 100644 index 0000000000000000000000000000000000000000..a85a96c4c4ae4f438cb75071a5f5d352a1914b13 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/64daac75-69fb-49c6-8282-a57921a64c6a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88819ed611b10071b650ebf37d64e5035b9ab7d123f642d75dfc23dc6b192e56 +size 55967 diff --git a/.lancedb/nltk_chunking.lance/data/651b68fb-ee6b-41dd-b773-76bbf4da8705.lance b/.lancedb/nltk_chunking.lance/data/651b68fb-ee6b-41dd-b773-76bbf4da8705.lance new file mode 100644 index 0000000000000000000000000000000000000000..34ec2b53156226d70f059b8b9c682e7c792f5db9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/651b68fb-ee6b-41dd-b773-76bbf4da8705.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a86a37425a3b8b70f3d09eaad7944f5f37cbdbe2fea1205b7ab44554a044decd +size 57395 diff --git a/.lancedb/nltk_chunking.lance/data/66958ca8-4f48-43ac-8e5b-4515fa9a1520.lance b/.lancedb/nltk_chunking.lance/data/66958ca8-4f48-43ac-8e5b-4515fa9a1520.lance new file mode 100644 index 0000000000000000000000000000000000000000..8939e79c7a5c724becc87a37c5dc001673993e48 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/66958ca8-4f48-43ac-8e5b-4515fa9a1520.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:267ce472711f9be135474e08693a3bbba16583782e7f2486b0a570dd0fd5474e +size 54297 diff --git a/.lancedb/nltk_chunking.lance/data/6747ef2e-a20e-4308-842d-59dffc676e88.lance b/.lancedb/nltk_chunking.lance/data/6747ef2e-a20e-4308-842d-59dffc676e88.lance new file mode 100644 index 0000000000000000000000000000000000000000..4df169a47082a274b62fcb00edcb31730b25ce2b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6747ef2e-a20e-4308-842d-59dffc676e88.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33f025c00ec78b18bc9efbf70de257b9513e7728cd2f71de2d2c67a30ca75209 +size 54894 diff --git a/.lancedb/nltk_chunking.lance/data/67d4bcca-04aa-4301-bcdc-9965a4a6e6b1.lance b/.lancedb/nltk_chunking.lance/data/67d4bcca-04aa-4301-bcdc-9965a4a6e6b1.lance new file mode 100644 index 0000000000000000000000000000000000000000..814770249ed4085c5c4a639057d534e2a90a784f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/67d4bcca-04aa-4301-bcdc-9965a4a6e6b1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c80c2f99c5aa72237d0253ee04ab4c679a77c9d1bfd81e301709cd69c1097b +size 54345 diff --git a/.lancedb/nltk_chunking.lance/data/6845949d-7ca1-48bb-9eac-edfe3845bb9c.lance b/.lancedb/nltk_chunking.lance/data/6845949d-7ca1-48bb-9eac-edfe3845bb9c.lance new file mode 100644 index 0000000000000000000000000000000000000000..e6e80485fc8770ab8c3d1eda7f6fd7b869b46ba5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6845949d-7ca1-48bb-9eac-edfe3845bb9c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:567dcbff7af86884a611c9ed2a99986bda8f6eea87105a0240e631d675e9e600 +size 54966 diff --git a/.lancedb/nltk_chunking.lance/data/6848beb6-4857-4240-8077-6e620dea2822.lance b/.lancedb/nltk_chunking.lance/data/6848beb6-4857-4240-8077-6e620dea2822.lance new file mode 100644 index 0000000000000000000000000000000000000000..63c1c505ea220b9abcc159e0d6a0ea8cc963dd52 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6848beb6-4857-4240-8077-6e620dea2822.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4547dac4127c11e1b207eede6f04d7f2ee0ac9ae05c3d9c71de40c2773be9bfd +size 57873 diff --git a/.lancedb/nltk_chunking.lance/data/6920ca08-f5e3-4ff4-b04a-7cbb0a6c78c4.lance b/.lancedb/nltk_chunking.lance/data/6920ca08-f5e3-4ff4-b04a-7cbb0a6c78c4.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f6fde719c2c5a64e9fec3057b9ea24d94e6ae65 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6920ca08-f5e3-4ff4-b04a-7cbb0a6c78c4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f393fc26dff031da7eb455c15627a032cd40b821f48df7647c67b909bcefd8da +size 61828 diff --git a/.lancedb/nltk_chunking.lance/data/692123a7-d126-4961-a888-f3494725aa49.lance b/.lancedb/nltk_chunking.lance/data/692123a7-d126-4961-a888-f3494725aa49.lance new file mode 100644 index 0000000000000000000000000000000000000000..6438d003cce80a8fd805c90068a9265699df0796 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/692123a7-d126-4961-a888-f3494725aa49.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:faf4add290651c4ff2bbe74c03eb3942f89ee078ddeaa8f053dd8c9341c2c0c8 +size 53932 diff --git a/.lancedb/nltk_chunking.lance/data/69b72bef-2e0b-4406-a5a2-a380af720d9e.lance b/.lancedb/nltk_chunking.lance/data/69b72bef-2e0b-4406-a5a2-a380af720d9e.lance new file mode 100644 index 0000000000000000000000000000000000000000..bf4187af72a24cc274a26fbe90dc740383aa1829 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/69b72bef-2e0b-4406-a5a2-a380af720d9e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fb2deb98f8acd014a6be4e6f3ab0627176462d3151ecd2c7c9e01e4e573c279 +size 66480 diff --git a/.lancedb/nltk_chunking.lance/data/6a0ec041-5728-4785-8425-1642895d1f06.lance b/.lancedb/nltk_chunking.lance/data/6a0ec041-5728-4785-8425-1642895d1f06.lance new file mode 100644 index 0000000000000000000000000000000000000000..5cd67424096779c977dd69a906728922fc744fb9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6a0ec041-5728-4785-8425-1642895d1f06.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af877f7a555336512ad0974821d4fd3816509522474113c12358a295e1231f31 +size 55523 diff --git a/.lancedb/nltk_chunking.lance/data/6a529ae3-66b1-43a3-96ba-bbdd417fa2fb.lance b/.lancedb/nltk_chunking.lance/data/6a529ae3-66b1-43a3-96ba-bbdd417fa2fb.lance new file mode 100644 index 0000000000000000000000000000000000000000..f26ef183f8d33722b0afee8b4a5df201db377f3c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6a529ae3-66b1-43a3-96ba-bbdd417fa2fb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32cd221b6a297f4d313b82cebc539541d83b273ad00faf7f0ec78600bd03cf75 +size 53858 diff --git a/.lancedb/nltk_chunking.lance/data/6bfc834d-5fff-4c18-9d8c-4a937e7e9d32.lance b/.lancedb/nltk_chunking.lance/data/6bfc834d-5fff-4c18-9d8c-4a937e7e9d32.lance new file mode 100644 index 0000000000000000000000000000000000000000..150afbf85e316370e27852f00711417514a40236 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6bfc834d-5fff-4c18-9d8c-4a937e7e9d32.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd59eeb68909539fc5fb305b4305beb34dc84deadb22a57f49faaf887d92712f +size 54362 diff --git a/.lancedb/nltk_chunking.lance/data/6c9b50db-f394-4cc8-9518-bf60cff444ca.lance 
b/.lancedb/nltk_chunking.lance/data/6c9b50db-f394-4cc8-9518-bf60cff444ca.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a5c4045f4811202ee6d18ac2a061592b1c7c044 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6c9b50db-f394-4cc8-9518-bf60cff444ca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5f89b389c8581a5f2125edda5d52bf5282d5f9e034fbaafa458ac191a73099b +size 54727 diff --git a/.lancedb/nltk_chunking.lance/data/6d1d1123-59ba-426d-bd0f-715cec0aa293.lance b/.lancedb/nltk_chunking.lance/data/6d1d1123-59ba-426d-bd0f-715cec0aa293.lance new file mode 100644 index 0000000000000000000000000000000000000000..0262875ff291f5a042325c003218dff9666e3c5a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6d1d1123-59ba-426d-bd0f-715cec0aa293.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d63a9d142e3d387083c334cdbc32eacc6040f40d2ea0e563056a2b9c1ef42f0e +size 54770 diff --git a/.lancedb/nltk_chunking.lance/data/6d2740fe-aadf-41ae-b6d5-c7dd38ae045c.lance b/.lancedb/nltk_chunking.lance/data/6d2740fe-aadf-41ae-b6d5-c7dd38ae045c.lance new file mode 100644 index 0000000000000000000000000000000000000000..789a22245af5b5ebf059ff49186265019627a81d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6d2740fe-aadf-41ae-b6d5-c7dd38ae045c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b460d3fd8c94d8f671454b5b75b41a786469a43c7766dbd91e374e1410b3ef02 +size 56774 diff --git a/.lancedb/nltk_chunking.lance/data/6defb6e8-9313-4847-b25e-5fa1e12b79d9.lance b/.lancedb/nltk_chunking.lance/data/6defb6e8-9313-4847-b25e-5fa1e12b79d9.lance new file mode 100644 index 0000000000000000000000000000000000000000..0e91888b6a3b2b017e31a29abbb7486cfafbc30c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6defb6e8-9313-4847-b25e-5fa1e12b79d9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9878322a290c13e70e77bd885f4322e61f62b8a8b9f5b05bd096bcd1456c62bd +size 54236 diff --git a/.lancedb/nltk_chunking.lance/data/6e2e5b84-372f-4acf-8912-1d65007f5ec8.lance b/.lancedb/nltk_chunking.lance/data/6e2e5b84-372f-4acf-8912-1d65007f5ec8.lance new file mode 100644 index 0000000000000000000000000000000000000000..2cf7838354aec397cc24fcc5f2b34a6d61d96083 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6e2e5b84-372f-4acf-8912-1d65007f5ec8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dda1fc5c394db10f017c0254ba35072e58d53df37bc2e273ffeeebe83ac38c0 +size 53144 diff --git a/.lancedb/nltk_chunking.lance/data/6e53a411-703b-465d-815b-000146ef5e5e.lance b/.lancedb/nltk_chunking.lance/data/6e53a411-703b-465d-815b-000146ef5e5e.lance new file mode 100644 index 0000000000000000000000000000000000000000..f0407dde91983ff1276f99e036bb98bb56a5ccc5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6e53a411-703b-465d-815b-000146ef5e5e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:894b76ea03185e8ad5ea832f49c7a59bd6899d53ea1be9c932727a777ebb4a65 +size 53888 diff --git a/.lancedb/nltk_chunking.lance/data/6e6ca3af-f267-42cf-9cf4-a441f7aef0ab.lance b/.lancedb/nltk_chunking.lance/data/6e6ca3af-f267-42cf-9cf4-a441f7aef0ab.lance new file mode 100644 index 0000000000000000000000000000000000000000..d7f8fe9596f9a6898a23745a5541ae42e5ec00e9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/6e6ca3af-f267-42cf-9cf4-a441f7aef0ab.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9bfe274075da1a2b785e0cae9b6c4893327c273e3fa11c2976413315561ad71c +size 54734 diff --git a/.lancedb/nltk_chunking.lance/data/70fff877-6e4c-47fb-bb39-13e832919370.lance b/.lancedb/nltk_chunking.lance/data/70fff877-6e4c-47fb-bb39-13e832919370.lance new file mode 100644 index 0000000000000000000000000000000000000000..440ad9e03c8ee19332e5cbe8aa0379a1683d6845 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/70fff877-6e4c-47fb-bb39-13e832919370.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cddac73ee4510cc779556cf3720e45eeb702b4def43a0fcd0c6927d92a123742 +size 56810 diff --git a/.lancedb/nltk_chunking.lance/data/715b654f-f8c8-4e0d-a19e-9f4fe2f05ced.lance b/.lancedb/nltk_chunking.lance/data/715b654f-f8c8-4e0d-a19e-9f4fe2f05ced.lance new file mode 100644 index 0000000000000000000000000000000000000000..b6ac5a16409b3fefe86bc36664d860935b290015 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/715b654f-f8c8-4e0d-a19e-9f4fe2f05ced.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25467edd431b2b810f736c61b758fde538db0f77a26822717cf297bed6d22f04 +size 56196 diff --git a/.lancedb/nltk_chunking.lance/data/719f578c-9d49-4d06-9d42-87bde9c5db16.lance b/.lancedb/nltk_chunking.lance/data/719f578c-9d49-4d06-9d42-87bde9c5db16.lance new file mode 100644 index 0000000000000000000000000000000000000000..95b3254059ca519f9a85dd7be5e273265977a1cb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/719f578c-9d49-4d06-9d42-87bde9c5db16.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e36f1d26eccaf6e8b220d9b17a76c0930022be9d3611d485fcc183eca3389d00 +size 56894 diff --git a/.lancedb/nltk_chunking.lance/data/7443a4d0-1f00-4d76-8a7f-e19b2d000c5d.lance b/.lancedb/nltk_chunking.lance/data/7443a4d0-1f00-4d76-8a7f-e19b2d000c5d.lance new file mode 100644 index 0000000000000000000000000000000000000000..94fd53b7936626a1e850a2ba2c42faad4635a325 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7443a4d0-1f00-4d76-8a7f-e19b2d000c5d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab5749ab5ef1261b7d468d3e0e2b0b641b80d58725a282cee5e491165eaddfc9 +size 58758 diff --git a/.lancedb/nltk_chunking.lance/data/74f5ac36-579c-422f-a0b6-fb3e6083988c.lance b/.lancedb/nltk_chunking.lance/data/74f5ac36-579c-422f-a0b6-fb3e6083988c.lance new file mode 100644 index 0000000000000000000000000000000000000000..33e58e153d0c086e7a37b8fb04a73e3b5123e6f7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/74f5ac36-579c-422f-a0b6-fb3e6083988c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b27d062f725fa867466b58b090cd554d1feb6ae1a150b27ef2def86daf76a7c2 +size 55416 diff --git a/.lancedb/nltk_chunking.lance/data/75ed15fb-1c3c-4a31-86ea-726f3ccab941.lance b/.lancedb/nltk_chunking.lance/data/75ed15fb-1c3c-4a31-86ea-726f3ccab941.lance new file mode 100644 index 0000000000000000000000000000000000000000..e13f96e631cf131a7527d05fc7a38ad33ce92f06 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/75ed15fb-1c3c-4a31-86ea-726f3ccab941.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ebb5d117447168e9da8982c39bfcc3a651a8fa1f281da16085804aa248e5c5b +size 55723 diff --git a/.lancedb/nltk_chunking.lance/data/7649126b-8a7c-47c0-a524-fdeebeeb0325.lance b/.lancedb/nltk_chunking.lance/data/7649126b-8a7c-47c0-a524-fdeebeeb0325.lance new file mode 100644 index 0000000000000000000000000000000000000000..dfe73823c8686c6e48a012d29d568b2ece536993 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/7649126b-8a7c-47c0-a524-fdeebeeb0325.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62103448414c31eb3a67144f7366bf5f27fbcc4b6f70909016ca654ae224e2df +size 56416 diff --git a/.lancedb/nltk_chunking.lance/data/7670fa18-d402-48f4-8bd4-55bcdc8f94e8.lance b/.lancedb/nltk_chunking.lance/data/7670fa18-d402-48f4-8bd4-55bcdc8f94e8.lance new file mode 100644 index 0000000000000000000000000000000000000000..f14bc4e8e720f2203f7ffe40f0540558369da555 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7670fa18-d402-48f4-8bd4-55bcdc8f94e8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12d06a51f0a45f8b2f4a45244d02463f1bb332c51b736d7d1bf3abdd0e1825dc +size 55626 diff --git a/.lancedb/nltk_chunking.lance/data/76716a28-9a8b-4629-94b4-3d9b6e950ed7.lance b/.lancedb/nltk_chunking.lance/data/76716a28-9a8b-4629-94b4-3d9b6e950ed7.lance new file mode 100644 index 0000000000000000000000000000000000000000..415338fd22fa1234423b3a148dfd0102c3aed927 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/76716a28-9a8b-4629-94b4-3d9b6e950ed7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9723082ea35d8ff7e08f47e9315873dee7be4a43555263491dc78129040610d5 +size 54631 diff --git a/.lancedb/nltk_chunking.lance/data/7673faa0-195e-4f61-a6e2-380ed2eda95d.lance b/.lancedb/nltk_chunking.lance/data/7673faa0-195e-4f61-a6e2-380ed2eda95d.lance new file mode 100644 index 0000000000000000000000000000000000000000..d2c24275748093ac841033eb9217513b37bf53ea --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7673faa0-195e-4f61-a6e2-380ed2eda95d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:727609ab721efc20f0861c251304a00cb6e358e8fa8142f5906e7237a9375b3f +size 56391 diff --git a/.lancedb/nltk_chunking.lance/data/76c57adb-457b-4fcc-b3b0-6b105eae17ed.lance b/.lancedb/nltk_chunking.lance/data/76c57adb-457b-4fcc-b3b0-6b105eae17ed.lance new file mode 100644 index 0000000000000000000000000000000000000000..b8996761ff082e556e2af1a27eb50c0725ecc527 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/76c57adb-457b-4fcc-b3b0-6b105eae17ed.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3814bed73e2c5a6513322debb195ead57e65bfe73e8935814eae2fc1a1e41614 +size 57927 diff --git a/.lancedb/nltk_chunking.lance/data/76c8c4e6-af1c-4dfd-a44b-8f8d1e047397.lance b/.lancedb/nltk_chunking.lance/data/76c8c4e6-af1c-4dfd-a44b-8f8d1e047397.lance new file mode 100644 index 0000000000000000000000000000000000000000..5dfa99bddc78e953be0acda42d05252de7645610 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/76c8c4e6-af1c-4dfd-a44b-8f8d1e047397.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73af91f28c6254e22e39d2224960179ee7999326367da2943ca14b3c6fd6ffb9 +size 56452 diff --git a/.lancedb/nltk_chunking.lance/data/76cbfe3e-b993-4b87-8d97-5aa1e042ee17.lance b/.lancedb/nltk_chunking.lance/data/76cbfe3e-b993-4b87-8d97-5aa1e042ee17.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b0bde2026f526c45b6ba499efed0b94d573ee4e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/76cbfe3e-b993-4b87-8d97-5aa1e042ee17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1641f4181e08092e797f5ccc17d16e11fcd88288651bf02419d1320d8897b611 +size 57769 diff --git a/.lancedb/nltk_chunking.lance/data/77a60273-56a5-4b34-8c69-15c8d6c0baf3.lance b/.lancedb/nltk_chunking.lance/data/77a60273-56a5-4b34-8c69-15c8d6c0baf3.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..22672fe460d891e3be24f4ca8b76e94156205e64 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/77a60273-56a5-4b34-8c69-15c8d6c0baf3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fa4ed43b8999352e3999c6bcac785ee696dd657955cb5501b1f0be21790e8c9 +size 55386 diff --git a/.lancedb/nltk_chunking.lance/data/77cdf885-4fbd-4fb3-a903-f0d0f0bd3a49.lance b/.lancedb/nltk_chunking.lance/data/77cdf885-4fbd-4fb3-a903-f0d0f0bd3a49.lance new file mode 100644 index 0000000000000000000000000000000000000000..f546fc9fd1e9b4391e474e07c54bb2953f2b5e48 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/77cdf885-4fbd-4fb3-a903-f0d0f0bd3a49.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a25f91ff25e2876fa05400923509b6459ac085029e077486ccb7bb1dddc264 +size 60112 diff --git a/.lancedb/nltk_chunking.lance/data/78c15c96-c6fd-415f-ba67-4fdbc11ebd41.lance b/.lancedb/nltk_chunking.lance/data/78c15c96-c6fd-415f-ba67-4fdbc11ebd41.lance new file mode 100644 index 0000000000000000000000000000000000000000..9316eb073fe76213e24ca3e4dbe06cc7cf0cbdb3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/78c15c96-c6fd-415f-ba67-4fdbc11ebd41.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dad11731d250a634b53af90d2762caeae78aeb2a68e4aaa463910e26139413e +size 55578 diff --git a/.lancedb/nltk_chunking.lance/data/79573750-07fb-4d73-8ee7-18d37260f818.lance b/.lancedb/nltk_chunking.lance/data/79573750-07fb-4d73-8ee7-18d37260f818.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2c63de9ec0ecb8df14e8459fd7da306bb8278b9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/79573750-07fb-4d73-8ee7-18d37260f818.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42f6d11013ab937a8ab1ee3b1abc798f9fb42876eec4f970ebe899bb9231feae +size 57547 diff --git a/.lancedb/nltk_chunking.lance/data/7ad9c3f5-952e-4eb4-b796-7e828d60218a.lance b/.lancedb/nltk_chunking.lance/data/7ad9c3f5-952e-4eb4-b796-7e828d60218a.lance new file mode 100644 index 0000000000000000000000000000000000000000..b0140d79971ae8b153f6bfd4cae701e06550ebb1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7ad9c3f5-952e-4eb4-b796-7e828d60218a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410b65a60a904763b77f5c7cfe3a8eb4dfc8da5428fdcc05118a565991f074c9 +size 56605 diff --git a/.lancedb/nltk_chunking.lance/data/7b736861-1e74-47ee-829f-aed616722af0.lance b/.lancedb/nltk_chunking.lance/data/7b736861-1e74-47ee-829f-aed616722af0.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d7cdabac72fc80d198d9058f6e9617ac2495c92 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7b736861-1e74-47ee-829f-aed616722af0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:439d62dce4fc1764eb69ab178dc4c1f755b693c8676e58b66c9c278d917c0e8e +size 57502 diff --git a/.lancedb/nltk_chunking.lance/data/7cdbc1b5-4352-48a6-a4f2-8350d2f23171.lance b/.lancedb/nltk_chunking.lance/data/7cdbc1b5-4352-48a6-a4f2-8350d2f23171.lance new file mode 100644 index 0000000000000000000000000000000000000000..be478096b9f6f5adedb8e2347bcb093a383f96ef --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7cdbc1b5-4352-48a6-a4f2-8350d2f23171.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44d7bcca961ba067eed39600d191c7b0b8e719f85d47dfced84369a88ac73530 +size 56024 diff --git 
a/.lancedb/nltk_chunking.lance/data/7d4d4605-05fa-4338-b4fb-66bc46b84370.lance b/.lancedb/nltk_chunking.lance/data/7d4d4605-05fa-4338-b4fb-66bc46b84370.lance new file mode 100644 index 0000000000000000000000000000000000000000..373d781622e311f55e102824b1128617278018a8 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7d4d4605-05fa-4338-b4fb-66bc46b84370.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:889a51d69fb1f4896a3a5e071f565883605946281d2c82ab7cb8b0fd8bc3d014 +size 55260 diff --git a/.lancedb/nltk_chunking.lance/data/7d69ddbb-4114-41a5-96c8-de1c177e8b3d.lance b/.lancedb/nltk_chunking.lance/data/7d69ddbb-4114-41a5-96c8-de1c177e8b3d.lance new file mode 100644 index 0000000000000000000000000000000000000000..cc9c26b44658bf52589df4648738945285e84eb9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7d69ddbb-4114-41a5-96c8-de1c177e8b3d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33c11bd11d7f98fdbee141b2bedb55198492c04b09315ec4ed06cb260f142824 +size 52798 diff --git a/.lancedb/nltk_chunking.lance/data/7dbb689a-976b-4cc0-bb2b-70ed3a8633eb.lance b/.lancedb/nltk_chunking.lance/data/7dbb689a-976b-4cc0-bb2b-70ed3a8633eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..b1f536d939d7e5d0158b2244f6cc7f388bf60583 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7dbb689a-976b-4cc0-bb2b-70ed3a8633eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:672f7410a2e2a0056f6b3722b9eaebd965294295ffbd49a6532333bda77ac80b +size 54057 diff --git a/.lancedb/nltk_chunking.lance/data/7e6310c1-f9c9-48cf-bcb7-7cf827ee801d.lance b/.lancedb/nltk_chunking.lance/data/7e6310c1-f9c9-48cf-bcb7-7cf827ee801d.lance new file mode 100644 index 0000000000000000000000000000000000000000..de9274500568fccd6f02e5d0d9c2fef6d8f9bfd3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7e6310c1-f9c9-48cf-bcb7-7cf827ee801d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9216fe977275294af3ff988c790661800d8219084a325ce56bdd398ddba6487 +size 54968 diff --git a/.lancedb/nltk_chunking.lance/data/7e93c6cf-0cf8-48e8-a42d-4f2069e4e006.lance b/.lancedb/nltk_chunking.lance/data/7e93c6cf-0cf8-48e8-a42d-4f2069e4e006.lance new file mode 100644 index 0000000000000000000000000000000000000000..21dfa4fc7ebdbc09807c9adad52e5b53423cf9ce --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7e93c6cf-0cf8-48e8-a42d-4f2069e4e006.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dfb1111f3640d3eef8ecb5aa247f443db8cb8af044107ffecbb3f1c8c46db87 +size 55721 diff --git a/.lancedb/nltk_chunking.lance/data/7f31fb33-874b-4cdf-ba37-74a02cdf1885.lance b/.lancedb/nltk_chunking.lance/data/7f31fb33-874b-4cdf-ba37-74a02cdf1885.lance new file mode 100644 index 0000000000000000000000000000000000000000..eda5b88e6af34030c6d0efc7914fab5d2e0ac8a5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7f31fb33-874b-4cdf-ba37-74a02cdf1885.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a5ac359d6738d733bfac0686c9fe4b5bb3b92bb6153ea35b9d3107c512658da +size 54913 diff --git a/.lancedb/nltk_chunking.lance/data/7fdfd7b5-41a0-47ba-8327-45077c234729.lance b/.lancedb/nltk_chunking.lance/data/7fdfd7b5-41a0-47ba-8327-45077c234729.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc7625197f86a16623cb6318a32f322a548e9346 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/7fdfd7b5-41a0-47ba-8327-45077c234729.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:78064028908cf823ab87a917e5cf4f56cd03032722a601c68e7cd3ed2d944adf +size 53565 diff --git a/.lancedb/nltk_chunking.lance/data/802cf164-3d37-4fa4-a135-af4f02f544b4.lance b/.lancedb/nltk_chunking.lance/data/802cf164-3d37-4fa4-a135-af4f02f544b4.lance new file mode 100644 index 0000000000000000000000000000000000000000..02fded76ecbbfed77eb6c9971d3e2b630086f899 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/802cf164-3d37-4fa4-a135-af4f02f544b4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaeb432f2c21ed53f6dc2d174d98cae318f09532570145c567f0672e90670f48 +size 54577 diff --git a/.lancedb/nltk_chunking.lance/data/804b390e-a196-4b02-82a8-a011de4a8083.lance b/.lancedb/nltk_chunking.lance/data/804b390e-a196-4b02-82a8-a011de4a8083.lance new file mode 100644 index 0000000000000000000000000000000000000000..03f2f65fd64e37fa0817c4c7d00fdaf6ab015ad1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/804b390e-a196-4b02-82a8-a011de4a8083.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0583ebbf97e3903b81d852341738c832e6d78f9cc56de6cebae9b2e1fdc0743c +size 55736 diff --git a/.lancedb/nltk_chunking.lance/data/8176dbe7-67bf-4e58-91b5-0d1c8617242b.lance b/.lancedb/nltk_chunking.lance/data/8176dbe7-67bf-4e58-91b5-0d1c8617242b.lance new file mode 100644 index 0000000000000000000000000000000000000000..7eaf0fe935f19e2dbea97fbad9db4699ff690191 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8176dbe7-67bf-4e58-91b5-0d1c8617242b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba4dd433f1eda9231433cf4760f00c7faf26e4316165f6b6319bfd865c460ed +size 53228 diff --git a/.lancedb/nltk_chunking.lance/data/81e8ad6d-efef-486f-b786-df2e8abc3f78.lance b/.lancedb/nltk_chunking.lance/data/81e8ad6d-efef-486f-b786-df2e8abc3f78.lance new file mode 100644 index 0000000000000000000000000000000000000000..3698be129f3e1d5888e5b5c6b7fac0918a0462ca --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/81e8ad6d-efef-486f-b786-df2e8abc3f78.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e5883787cefe23a6e5366a98d09cbdb614a600156d462a8fa73ba6c7ee794c7 +size 56115 diff --git a/.lancedb/nltk_chunking.lance/data/82e8796a-0146-4217-9d6b-884cd94932e4.lance b/.lancedb/nltk_chunking.lance/data/82e8796a-0146-4217-9d6b-884cd94932e4.lance new file mode 100644 index 0000000000000000000000000000000000000000..1104ff6b469852be05f61d6e1e2c1e6c74f6658a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/82e8796a-0146-4217-9d6b-884cd94932e4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a2d6cbb7d8fb1abbad6b2ac004f7108eae67ea1554db39a0715d7089966e830 +size 56968 diff --git a/.lancedb/nltk_chunking.lance/data/831d7ec3-6f3a-4a41-b3b6-2acc3048b699.lance b/.lancedb/nltk_chunking.lance/data/831d7ec3-6f3a-4a41-b3b6-2acc3048b699.lance new file mode 100644 index 0000000000000000000000000000000000000000..63586a6e7da71cd5885b9136b3f7dc471c64795f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/831d7ec3-6f3a-4a41-b3b6-2acc3048b699.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d4d65e2bd90074a8a1b0441d4f77b2b1ae1bbcb6f212fe2dc6bf04687b180f2 +size 55582 diff --git a/.lancedb/nltk_chunking.lance/data/84414e6c-6e2f-483b-8516-1df13f89959f.lance b/.lancedb/nltk_chunking.lance/data/84414e6c-6e2f-483b-8516-1df13f89959f.lance new file mode 100644 index 0000000000000000000000000000000000000000..7906eef54a8de43341d55a16ab227aff2e83d9e0 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/84414e6c-6e2f-483b-8516-1df13f89959f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f06c8cb00b69ef99b3ab33c381de0a7902ca4f4bb4e134144e9577b7980ef102 +size 53788 diff --git a/.lancedb/nltk_chunking.lance/data/844c4705-04f2-43c5-8496-fd0b2ec0d161.lance b/.lancedb/nltk_chunking.lance/data/844c4705-04f2-43c5-8496-fd0b2ec0d161.lance new file mode 100644 index 0000000000000000000000000000000000000000..168b56b402e063b7e68705172ebd54d2bef2cdf6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/844c4705-04f2-43c5-8496-fd0b2ec0d161.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4f1dea7fe110404129fdeec4059fe8660352022fd6abe92eea5c2d344c60e83 +size 55505 diff --git a/.lancedb/nltk_chunking.lance/data/862600dc-e283-41ea-a8e1-d3f14c1f6d95.lance b/.lancedb/nltk_chunking.lance/data/862600dc-e283-41ea-a8e1-d3f14c1f6d95.lance new file mode 100644 index 0000000000000000000000000000000000000000..319d3071b048110a05a8657fef05fb767e76e27a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/862600dc-e283-41ea-a8e1-d3f14c1f6d95.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a9515f82d752a3d84d51ac3e458597b5075027a1613526ebabec8c9b5302a9a +size 56347 diff --git a/.lancedb/nltk_chunking.lance/data/8690af9e-1399-4a2b-92dd-68ebc2f1cb7e.lance b/.lancedb/nltk_chunking.lance/data/8690af9e-1399-4a2b-92dd-68ebc2f1cb7e.lance new file mode 100644 index 0000000000000000000000000000000000000000..1fa176ebc69e978e05db076c6000c480ddba4fce --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8690af9e-1399-4a2b-92dd-68ebc2f1cb7e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc8f3acf9eb94b3c783f18741afcb1238ebf6941b2a190dc71d6edaaadab64ed +size 53948 diff --git a/.lancedb/nltk_chunking.lance/data/86c6da5d-23af-44e0-a93a-ea04a08e3af7.lance b/.lancedb/nltk_chunking.lance/data/86c6da5d-23af-44e0-a93a-ea04a08e3af7.lance new file mode 100644 index 0000000000000000000000000000000000000000..ce3bd21b39c13b617226fdaeb1a16ddfd1673ddb --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/86c6da5d-23af-44e0-a93a-ea04a08e3af7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a778d2aa59f7e547749fedd44f90d1cb1b8845a6dd17aba3468ef77ddbf7fa03 +size 56918 diff --git a/.lancedb/nltk_chunking.lance/data/8797b0f0-248c-41d3-b6e0-d2c8a0a749f4.lance b/.lancedb/nltk_chunking.lance/data/8797b0f0-248c-41d3-b6e0-d2c8a0a749f4.lance new file mode 100644 index 0000000000000000000000000000000000000000..c0854c199c3e32c5805592b342267173965236fd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8797b0f0-248c-41d3-b6e0-d2c8a0a749f4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef049a0f65b4503497e888d6c893f044cbe89b573fdf390587d7a2f63374458e +size 54184 diff --git a/.lancedb/nltk_chunking.lance/data/887b64ea-ff31-4bee-86d1-56babb719f79.lance b/.lancedb/nltk_chunking.lance/data/887b64ea-ff31-4bee-86d1-56babb719f79.lance new file mode 100644 index 0000000000000000000000000000000000000000..5655f10a74c69061bd473059bab36c24db05c8a1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/887b64ea-ff31-4bee-86d1-56babb719f79.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3aae048c9566e079fcae214ed6304ae9f3fc5291dcfb844ba807a13c531dfd3 +size 57828 diff --git a/.lancedb/nltk_chunking.lance/data/899edc14-6f3d-4436-952f-b0391a0b0555.lance 
b/.lancedb/nltk_chunking.lance/data/899edc14-6f3d-4436-952f-b0391a0b0555.lance new file mode 100644 index 0000000000000000000000000000000000000000..30701322142edc4d8105cbd078524dcd91d1854b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/899edc14-6f3d-4436-952f-b0391a0b0555.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc9fd7c40fec0dc8444ca99b19506d90740459e44493b439a6a0cd907456ec0d +size 55253 diff --git a/.lancedb/nltk_chunking.lance/data/8a061c7f-b8f2-45a6-9d96-6f1751536812.lance b/.lancedb/nltk_chunking.lance/data/8a061c7f-b8f2-45a6-9d96-6f1751536812.lance new file mode 100644 index 0000000000000000000000000000000000000000..321125fe687b7ae3957f7b4995994e0ae9fa50dd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8a061c7f-b8f2-45a6-9d96-6f1751536812.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f04f3e664d700e0890cbcc5de03fcc256f39ae4e50b89691fd254e9a456db298 +size 59034 diff --git a/.lancedb/nltk_chunking.lance/data/8b9bb228-c0f1-4cc2-8c7a-b5041849eb78.lance b/.lancedb/nltk_chunking.lance/data/8b9bb228-c0f1-4cc2-8c7a-b5041849eb78.lance new file mode 100644 index 0000000000000000000000000000000000000000..5d3e68790fec1d8dffb3ace8ad5558da01b833f9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8b9bb228-c0f1-4cc2-8c7a-b5041849eb78.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:591bfa6d9751a2027cb4af4e1a0bc264c80fe0deeabed6d3a5677660a9dad79d +size 57395 diff --git a/.lancedb/nltk_chunking.lance/data/8bcc8e31-c979-46b1-8b48-da7d4f1bcab4.lance b/.lancedb/nltk_chunking.lance/data/8bcc8e31-c979-46b1-8b48-da7d4f1bcab4.lance new file mode 100644 index 0000000000000000000000000000000000000000..c50d4d3963b1ec878ff52a83bd979c0329dca8d9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8bcc8e31-c979-46b1-8b48-da7d4f1bcab4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5cfdf089abaa1cadad5fbae814f04fd9dbdbbc09bddcc26af27da9ae70146d8 +size 55813 diff --git a/.lancedb/nltk_chunking.lance/data/8c09abce-803c-412c-bebe-a5782938a9a9.lance b/.lancedb/nltk_chunking.lance/data/8c09abce-803c-412c-bebe-a5782938a9a9.lance new file mode 100644 index 0000000000000000000000000000000000000000..d3c5fc3d62ec68986edc472fd275977e37362e30 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8c09abce-803c-412c-bebe-a5782938a9a9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a408ec62e1951c76248da3e6475fb467c10a4aebb7d2458b5f1aa0cd01c7244e +size 56651 diff --git a/.lancedb/nltk_chunking.lance/data/8cab93a1-92a2-4330-be29-9a00902dce57.lance b/.lancedb/nltk_chunking.lance/data/8cab93a1-92a2-4330-be29-9a00902dce57.lance new file mode 100644 index 0000000000000000000000000000000000000000..604ca27f0154461003ab4fda6d00ae77f566b463 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8cab93a1-92a2-4330-be29-9a00902dce57.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d252bf3f5f15b872eff0c9d8c62d56587d2838032a3bd8000e6f2846d45669a5 +size 53689 diff --git a/.lancedb/nltk_chunking.lance/data/8ec093e1-ec34-4d00-9fdf-de4bce825994.lance b/.lancedb/nltk_chunking.lance/data/8ec093e1-ec34-4d00-9fdf-de4bce825994.lance new file mode 100644 index 0000000000000000000000000000000000000000..984f4aad895118fa31eaf70887bf3c27d7264b9a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8ec093e1-ec34-4d00-9fdf-de4bce825994.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:373fd7e1dc7dceb7d142b6dd2c8d95013ef2b61bf794f0904b33e1a941d5139b +size 56225 diff --git a/.lancedb/nltk_chunking.lance/data/8f6cb2a0-b97c-4426-ac47-9c1bb22bc05e.lance b/.lancedb/nltk_chunking.lance/data/8f6cb2a0-b97c-4426-ac47-9c1bb22bc05e.lance new file mode 100644 index 0000000000000000000000000000000000000000..4a30bd23774691a50a9a77d25fa74a45360f905f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/8f6cb2a0-b97c-4426-ac47-9c1bb22bc05e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:123887e787e1a3554970285a9344de47753a31fee9669e7df70855a74d1e4daf +size 61790 diff --git a/.lancedb/nltk_chunking.lance/data/90976ea6-c365-46d7-be0b-8f77a5b2da61.lance b/.lancedb/nltk_chunking.lance/data/90976ea6-c365-46d7-be0b-8f77a5b2da61.lance new file mode 100644 index 0000000000000000000000000000000000000000..7ab07de8e9ee51dc4548c80d310eafff54b6869f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/90976ea6-c365-46d7-be0b-8f77a5b2da61.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5d6a211b2c818f65aca51caaeef8207995d438e2d8e8a9c8ffbdc47f1fccfc8 +size 53793 diff --git a/.lancedb/nltk_chunking.lance/data/90e81800-8d1e-46d3-aaf7-2f3df9c89381.lance b/.lancedb/nltk_chunking.lance/data/90e81800-8d1e-46d3-aaf7-2f3df9c89381.lance new file mode 100644 index 0000000000000000000000000000000000000000..9f3a0d39339fe1a1080fa35218c5316e05d8837b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/90e81800-8d1e-46d3-aaf7-2f3df9c89381.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62ee70f94576ca34d057d1d28667a6dd1f6519c3edd960f1e1e6665a54a2c9f2 +size 53447 diff --git a/.lancedb/nltk_chunking.lance/data/90efff81-deba-458d-b712-20a878088983.lance b/.lancedb/nltk_chunking.lance/data/90efff81-deba-458d-b712-20a878088983.lance new file mode 100644 index 0000000000000000000000000000000000000000..2de949e91035eb9d0ecadd2412e3d7fc62d25cf0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/90efff81-deba-458d-b712-20a878088983.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c729483408aeb0675efd360562076c4c0c63062db17ec8e90294c5724b2a7f76 +size 56026 diff --git a/.lancedb/nltk_chunking.lance/data/9178ef81-e717-44f9-bfa2-c75fee633984.lance b/.lancedb/nltk_chunking.lance/data/9178ef81-e717-44f9-bfa2-c75fee633984.lance new file mode 100644 index 0000000000000000000000000000000000000000..e6de034a62fdaeebb524ebf965f05a23a6243262 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9178ef81-e717-44f9-bfa2-c75fee633984.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9611cf47e185dabb58d7cf3fb4fdbf2d3bcd708030556e5c744289ebc4d880ec +size 56372 diff --git a/.lancedb/nltk_chunking.lance/data/91e26e1c-3554-4c5b-851e-bdc8b801a31c.lance b/.lancedb/nltk_chunking.lance/data/91e26e1c-3554-4c5b-851e-bdc8b801a31c.lance new file mode 100644 index 0000000000000000000000000000000000000000..fec40021baa9cc2c7aa8b1a9f9190f1c30c50a10 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/91e26e1c-3554-4c5b-851e-bdc8b801a31c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d77ef8bf29eb7334542c7500c5e41ba7663c677edd32c468f8b3a3d91f96f5ae +size 58447 diff --git a/.lancedb/nltk_chunking.lance/data/92061e60-f7cc-4693-8216-f3919b1f6043.lance b/.lancedb/nltk_chunking.lance/data/92061e60-f7cc-4693-8216-f3919b1f6043.lance new file mode 100644 index 0000000000000000000000000000000000000000..43009bb70314d278ae0d25779f060c2d875d9ef9 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/92061e60-f7cc-4693-8216-f3919b1f6043.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5787f3320453ca2bb9ec9efd3af6ab97960bed90719d60d3ed61c315561eff5d +size 55673 diff --git a/.lancedb/nltk_chunking.lance/data/9237f60a-386c-4a9a-adf9-6949bdc1ae54.lance b/.lancedb/nltk_chunking.lance/data/9237f60a-386c-4a9a-adf9-6949bdc1ae54.lance new file mode 100644 index 0000000000000000000000000000000000000000..227ff8bfcf481984c12878b5b0485c5ec55c2fa9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9237f60a-386c-4a9a-adf9-6949bdc1ae54.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee31d4546c0efb946a178d945ab7e92391d61f255585d0cc4080394d7f3db5ac +size 57012 diff --git a/.lancedb/nltk_chunking.lance/data/92480cb5-4847-4b9d-b9d0-25d78b2da327.lance b/.lancedb/nltk_chunking.lance/data/92480cb5-4847-4b9d-b9d0-25d78b2da327.lance new file mode 100644 index 0000000000000000000000000000000000000000..454e66f8ee04737484683ffbce3580656945000d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/92480cb5-4847-4b9d-b9d0-25d78b2da327.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e381d9c1691de00a37832f36ddc9e2cbb163db24b8f8fd229c2f7aa3f868ca7b +size 53097 diff --git a/.lancedb/nltk_chunking.lance/data/92738c4f-caba-4834-9eb6-b56c5155132d.lance b/.lancedb/nltk_chunking.lance/data/92738c4f-caba-4834-9eb6-b56c5155132d.lance new file mode 100644 index 0000000000000000000000000000000000000000..8dc1ace6781e3cdf2efcc5d6b77dcef40a0c423d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/92738c4f-caba-4834-9eb6-b56c5155132d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40865646c80ef770f045cbda7e904da0c67d3ba33887e5e6628d068def98275e +size 56736 diff --git a/.lancedb/nltk_chunking.lance/data/927d9287-a7a7-4004-b089-32f2cde6f6ac.lance b/.lancedb/nltk_chunking.lance/data/927d9287-a7a7-4004-b089-32f2cde6f6ac.lance new file mode 100644 index 0000000000000000000000000000000000000000..0583327973148c41bba955dfe5d2831058116a70 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/927d9287-a7a7-4004-b089-32f2cde6f6ac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:898fb95131bae644ca206c7e0ac5af8687b1b3704e7b76e24661e6ca3dc3b058 +size 54177 diff --git a/.lancedb/nltk_chunking.lance/data/9350a802-9de0-4581-a025-08930368a1db.lance b/.lancedb/nltk_chunking.lance/data/9350a802-9de0-4581-a025-08930368a1db.lance new file mode 100644 index 0000000000000000000000000000000000000000..718b7c46f3da4c3465bb981c291cc7e84c9d1821 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9350a802-9de0-4581-a025-08930368a1db.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af0a062804376a11373e7ffcfe7c8ac16466c6aff4e206a1d3cb2c5807ee29c9 +size 54982 diff --git a/.lancedb/nltk_chunking.lance/data/935a7836-5618-4883-9f90-a83de270d868.lance b/.lancedb/nltk_chunking.lance/data/935a7836-5618-4883-9f90-a83de270d868.lance new file mode 100644 index 0000000000000000000000000000000000000000..e7f6322567504d5796199619ce9707831031929b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/935a7836-5618-4883-9f90-a83de270d868.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd34f1769f6dade7c1826472a0b058940f7494fee1adc96425cf876950e41680 +size 54627 diff --git a/.lancedb/nltk_chunking.lance/data/93c76089-39f9-4874-b6a1-c9d9a15ece8c.lance b/.lancedb/nltk_chunking.lance/data/93c76089-39f9-4874-b6a1-c9d9a15ece8c.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..4c47e4a2f86ab0a086984bc17202e35c8defa4af --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/93c76089-39f9-4874-b6a1-c9d9a15ece8c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf038fab2fc008b0a76b62ee21f9a5753dfde2035a923fc2953b3c47db93c5ff +size 53362 diff --git a/.lancedb/nltk_chunking.lance/data/967cc72b-7bb3-432b-9c13-dbf6c51088f8.lance b/.lancedb/nltk_chunking.lance/data/967cc72b-7bb3-432b-9c13-dbf6c51088f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..a0d41c90d597d05d0a1abceba6d9144b0fa4bb35 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/967cc72b-7bb3-432b-9c13-dbf6c51088f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7a082e9f2d90e08f5e7b9dac1248c8d9f708e899a36a16c91941909d444fadb +size 57117 diff --git a/.lancedb/nltk_chunking.lance/data/971b6ab5-8060-449d-bcaa-1a86ff7d1dbc.lance b/.lancedb/nltk_chunking.lance/data/971b6ab5-8060-449d-bcaa-1a86ff7d1dbc.lance new file mode 100644 index 0000000000000000000000000000000000000000..b0cd4098b919068290e662205d448c85d8d24449 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/971b6ab5-8060-449d-bcaa-1a86ff7d1dbc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d616b6d6e91751b2c5bc6749bf794a97090555ecb055c806322aa4d3ef2b797e +size 55017 diff --git a/.lancedb/nltk_chunking.lance/data/9735adcd-b163-42de-a602-d82eeffb34f8.lance b/.lancedb/nltk_chunking.lance/data/9735adcd-b163-42de-a602-d82eeffb34f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..695ddcb7a5680d5807fc000aa9ab48315f8cddc2 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9735adcd-b163-42de-a602-d82eeffb34f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab89eb1b3233ecba026b9377d64af163494c648da4353434aff2e543307ed79d +size 54053 diff --git a/.lancedb/nltk_chunking.lance/data/98e5a5a2-1d7d-4767-a46f-bc9817b035e1.lance b/.lancedb/nltk_chunking.lance/data/98e5a5a2-1d7d-4767-a46f-bc9817b035e1.lance new file mode 100644 index 0000000000000000000000000000000000000000..62d85d6392ff314086d800a2e5fd14f8f0959623 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/98e5a5a2-1d7d-4767-a46f-bc9817b035e1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecbc6b938d0b872d1711de8957922475391af44759ed4883c9f5d4def45d8e89 +size 57766 diff --git a/.lancedb/nltk_chunking.lance/data/99b9a4df-dc66-428e-b831-18ffa8d76354.lance b/.lancedb/nltk_chunking.lance/data/99b9a4df-dc66-428e-b831-18ffa8d76354.lance new file mode 100644 index 0000000000000000000000000000000000000000..fce5eb8f51286c8f81dfdb3bc4ccc3d97522678a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/99b9a4df-dc66-428e-b831-18ffa8d76354.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a9032e728584a72155cee983d6b8788e7101edc13b3f4d3039f5f9df5893dec +size 54049 diff --git a/.lancedb/nltk_chunking.lance/data/99d10824-f980-49da-8eb9-091ea17704b0.lance b/.lancedb/nltk_chunking.lance/data/99d10824-f980-49da-8eb9-091ea17704b0.lance new file mode 100644 index 0000000000000000000000000000000000000000..010ba68ff901167edcee283bc5afd715bdeee08f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/99d10824-f980-49da-8eb9-091ea17704b0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a483c0e78f589f34ac20c54eb106085d2d2a9d63111507f0ae304fea66018a74 +size 56492 diff --git 
a/.lancedb/nltk_chunking.lance/data/9a012b14-fe2d-4bdd-aa75-8f811f985eef.lance b/.lancedb/nltk_chunking.lance/data/9a012b14-fe2d-4bdd-aa75-8f811f985eef.lance new file mode 100644 index 0000000000000000000000000000000000000000..e9284b410f90f4ad3b65e6a36f21ef6820de3795 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9a012b14-fe2d-4bdd-aa75-8f811f985eef.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626de29215c0cfe9ce0ad9bb8a3b5d3c8f5ef8dfa7bb930ebd01407aab1cca17 +size 54768 diff --git a/.lancedb/nltk_chunking.lance/data/9a264cc3-f516-4a5c-8d17-61f97daf1fe2.lance b/.lancedb/nltk_chunking.lance/data/9a264cc3-f516-4a5c-8d17-61f97daf1fe2.lance new file mode 100644 index 0000000000000000000000000000000000000000..1e0e4bc14c99e14fa9e7c44013cea643f7e41dc4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9a264cc3-f516-4a5c-8d17-61f97daf1fe2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89614214edb52a20f8235c1b8a5f31713b5eeacd2f7cfb1a31c9da4c2e141a33 +size 56890 diff --git a/.lancedb/nltk_chunking.lance/data/9ada5ede-0984-4a53-8e10-5e9def9c5ef2.lance b/.lancedb/nltk_chunking.lance/data/9ada5ede-0984-4a53-8e10-5e9def9c5ef2.lance new file mode 100644 index 0000000000000000000000000000000000000000..1e8f7c7304c7e2953f05aaf20ed142a0bff9c723 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9ada5ede-0984-4a53-8e10-5e9def9c5ef2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caebb51b353082774b42d3f84bc3ed7d075b6582ff01ffb412fa0eb0515f3481 +size 54996 diff --git a/.lancedb/nltk_chunking.lance/data/9b47f09f-646f-4147-8dfb-8f39632b78fc.lance b/.lancedb/nltk_chunking.lance/data/9b47f09f-646f-4147-8dfb-8f39632b78fc.lance new file mode 100644 index 0000000000000000000000000000000000000000..0cd89c3ecc13676d24a8173d4c429574f0f1c834 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9b47f09f-646f-4147-8dfb-8f39632b78fc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c14b2c712acee4f58d8de1538aa0bccd5b20558b8a87d71830e3addd2bc8d0 +size 58791 diff --git a/.lancedb/nltk_chunking.lance/data/9b48bd4a-dbb7-4f80-a07d-ee77a16ba1e9.lance b/.lancedb/nltk_chunking.lance/data/9b48bd4a-dbb7-4f80-a07d-ee77a16ba1e9.lance new file mode 100644 index 0000000000000000000000000000000000000000..622e97258e0af1533ffea799912cbe24a939db6b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9b48bd4a-dbb7-4f80-a07d-ee77a16ba1e9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36deb7afd05d1736f7f3e68d8c847ee18f96cd76296471d582317504b7c74cc6 +size 63843 diff --git a/.lancedb/nltk_chunking.lance/data/9c1dd097-5bad-4302-a393-a7447b993314.lance b/.lancedb/nltk_chunking.lance/data/9c1dd097-5bad-4302-a393-a7447b993314.lance new file mode 100644 index 0000000000000000000000000000000000000000..ebe178d695ace1ccd2fc2d093d95c5062cb12534 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9c1dd097-5bad-4302-a393-a7447b993314.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd18e8642a7060d516702aa63b83e05d7fde01e945373cde5da418e6c6f7ab15 +size 56299 diff --git a/.lancedb/nltk_chunking.lance/data/9c5bc62f-2ca3-447e-933b-cae280dd7f68.lance b/.lancedb/nltk_chunking.lance/data/9c5bc62f-2ca3-447e-933b-cae280dd7f68.lance new file mode 100644 index 0000000000000000000000000000000000000000..3d47fc3a68cc136020b7cf42404e0f454568dc1e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9c5bc62f-2ca3-447e-933b-cae280dd7f68.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:30c26a0241ccac0926384026677dab70d56bb1581449eebce450a800a22e3d58 +size 55721 diff --git a/.lancedb/nltk_chunking.lance/data/9c674095-1dd2-433a-9aa8-711bfc28dbd2.lance b/.lancedb/nltk_chunking.lance/data/9c674095-1dd2-433a-9aa8-711bfc28dbd2.lance new file mode 100644 index 0000000000000000000000000000000000000000..a9b0466dd393066b8706378b4a2daf1f5ec8cbc3 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9c674095-1dd2-433a-9aa8-711bfc28dbd2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c023431078ad328021e491a06d49fd4c7b2f1d2ac2734264aaf03de650f4e8bf +size 55150 diff --git a/.lancedb/nltk_chunking.lance/data/9d4c0d35-316d-451e-865d-57435b882bb7.lance b/.lancedb/nltk_chunking.lance/data/9d4c0d35-316d-451e-865d-57435b882bb7.lance new file mode 100644 index 0000000000000000000000000000000000000000..490a4b71d4b4cfee97f8089c74798842c1a77b37 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9d4c0d35-316d-451e-865d-57435b882bb7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1437a2092c0d5bf220deb5a0a8c948931b638ff273f85d387a7e81a9611a3cb +size 55610 diff --git a/.lancedb/nltk_chunking.lance/data/9d8b5714-2149-421f-9b26-17c12d21c429.lance b/.lancedb/nltk_chunking.lance/data/9d8b5714-2149-421f-9b26-17c12d21c429.lance new file mode 100644 index 0000000000000000000000000000000000000000..6eabe35451aa833cef17614fe2a3bae3542bd4a6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9d8b5714-2149-421f-9b26-17c12d21c429.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24084b28736ae1c99b2bbf4ca193c0775a2adc7ba85890a45e324c46fcc55e99 +size 54860 diff --git a/.lancedb/nltk_chunking.lance/data/9e5cbb8f-8d4c-4a84-b684-a0cc6d67fc4e.lance b/.lancedb/nltk_chunking.lance/data/9e5cbb8f-8d4c-4a84-b684-a0cc6d67fc4e.lance new file mode 100644 index 0000000000000000000000000000000000000000..c7c8f693f8b693854580c1ab526235343e7b847c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9e5cbb8f-8d4c-4a84-b684-a0cc6d67fc4e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5247bcbc99abedeeb6f98c5892d324a921830b365624ffe6019ce360c96911fc +size 56574 diff --git a/.lancedb/nltk_chunking.lance/data/9e7e37f7-6810-48d0-8d3c-879dfdbfeca8.lance b/.lancedb/nltk_chunking.lance/data/9e7e37f7-6810-48d0-8d3c-879dfdbfeca8.lance new file mode 100644 index 0000000000000000000000000000000000000000..7cfc258c7da25d51d296ad31cdbafed309cc368a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9e7e37f7-6810-48d0-8d3c-879dfdbfeca8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cec60384547393cd8a4def0874aceacd9a2775b7e6b8fb2cb278ad7005e735d +size 60458 diff --git a/.lancedb/nltk_chunking.lance/data/9ed2c8ec-467f-48d9-9ff0-e8f8b01d7873.lance b/.lancedb/nltk_chunking.lance/data/9ed2c8ec-467f-48d9-9ff0-e8f8b01d7873.lance new file mode 100644 index 0000000000000000000000000000000000000000..774b3cf6688df23fd9959d912ad342593e62cd2c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9ed2c8ec-467f-48d9-9ff0-e8f8b01d7873.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9035ff5a981c96875577d5bb3529a368424effa13cd2f7a317cceee450a578d +size 55666 diff --git a/.lancedb/nltk_chunking.lance/data/9f10564d-55f5-4ed9-841a-bd261967ab0e.lance b/.lancedb/nltk_chunking.lance/data/9f10564d-55f5-4ed9-841a-bd261967ab0e.lance new file mode 100644 index 0000000000000000000000000000000000000000..45711566eff7033307886e221ec3bd387e5b0873 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9f10564d-55f5-4ed9-841a-bd261967ab0e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d967423bc6eb113051506c3749fd4ef7b5a87e1918c14ac7bdac9c88315d99fd +size 55443 diff --git a/.lancedb/nltk_chunking.lance/data/9fcf78b4-22ff-450b-842c-65bd35d12866.lance b/.lancedb/nltk_chunking.lance/data/9fcf78b4-22ff-450b-842c-65bd35d12866.lance new file mode 100644 index 0000000000000000000000000000000000000000..e4eb7d8c8a7aec2ca55d3814a18d3f35a7a668ec --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/9fcf78b4-22ff-450b-842c-65bd35d12866.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19aff3ab83f6d783e650e9b497776e3d01ad572a0969546df221e0626cd10811 +size 55330 diff --git a/.lancedb/nltk_chunking.lance/data/a13604c2-e177-4511-afd8-3cf053a8f47b.lance b/.lancedb/nltk_chunking.lance/data/a13604c2-e177-4511-afd8-3cf053a8f47b.lance new file mode 100644 index 0000000000000000000000000000000000000000..7de8598a621c8cf92703c41990737c71f01d0c59 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a13604c2-e177-4511-afd8-3cf053a8f47b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22f0ca142d1fdc9620158cd407d3f78611ba84223ae08cfa730ee23f264b9caf +size 53562 diff --git a/.lancedb/nltk_chunking.lance/data/a13dd2e7-bf94-4e9a-9ba3-e29ee02509ee.lance b/.lancedb/nltk_chunking.lance/data/a13dd2e7-bf94-4e9a-9ba3-e29ee02509ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..63adef3332bae64478d8aa3cae85d21bb341aab7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a13dd2e7-bf94-4e9a-9ba3-e29ee02509ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06eca987e0eeca2d153074624529196520e4e57762a04297cf75215a56658eae +size 55637 diff --git a/.lancedb/nltk_chunking.lance/data/a17488eb-77fc-4fe0-9ef4-13205a06344a.lance b/.lancedb/nltk_chunking.lance/data/a17488eb-77fc-4fe0-9ef4-13205a06344a.lance new file mode 100644 index 0000000000000000000000000000000000000000..53fc1cbee313523995c4b148056e8969fbca2150 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a17488eb-77fc-4fe0-9ef4-13205a06344a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b76f82fcaa46efb2ce021149b83e08e05377f9dfe349b71c3352dadf6df5a94 +size 55077 diff --git a/.lancedb/nltk_chunking.lance/data/a291498e-4e5a-4d47-91fd-d7dd07031020.lance b/.lancedb/nltk_chunking.lance/data/a291498e-4e5a-4d47-91fd-d7dd07031020.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f063c922a128d70f3c3d7031f9e2042cc45d514 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a291498e-4e5a-4d47-91fd-d7dd07031020.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1f5b7da62bf1680387d3585a631e9920e888ee1f7fdf0a4ccfb5d4afd7724bc +size 57420 diff --git a/.lancedb/nltk_chunking.lance/data/a2ce63c0-cbef-4547-bc97-ecf9ce2eef75.lance b/.lancedb/nltk_chunking.lance/data/a2ce63c0-cbef-4547-bc97-ecf9ce2eef75.lance new file mode 100644 index 0000000000000000000000000000000000000000..4a94889bc58657379589e974a5d77c7fb3070ed7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a2ce63c0-cbef-4547-bc97-ecf9ce2eef75.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:038eff5cf12c5518eecd97f0e9c4da4365af413c20c0760261380611b8275ed7 +size 55619 diff --git a/.lancedb/nltk_chunking.lance/data/a2e00800-07cd-490c-8cc3-a7d308c70dad.lance 
b/.lancedb/nltk_chunking.lance/data/a2e00800-07cd-490c-8cc3-a7d308c70dad.lance new file mode 100644 index 0000000000000000000000000000000000000000..960e73db7710fc9a951328a7ccfb8cd4f31324b4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a2e00800-07cd-490c-8cc3-a7d308c70dad.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9f07a1504908c5420f99553c1419d3e16983cbfa9cc0893d57689f13ea2c08e +size 53952 diff --git a/.lancedb/nltk_chunking.lance/data/a4f6f9d4-fb0c-450e-9a62-1397b90ee801.lance b/.lancedb/nltk_chunking.lance/data/a4f6f9d4-fb0c-450e-9a62-1397b90ee801.lance new file mode 100644 index 0000000000000000000000000000000000000000..bf8da433ff853eeb24ad9ed2b5d53e6e62a1a0b9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a4f6f9d4-fb0c-450e-9a62-1397b90ee801.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c2f037f1e099929d5151cb1b05614b4046f6d7c8b32c0d1f26908125f184e8d +size 54923 diff --git a/.lancedb/nltk_chunking.lance/data/a510c48e-92c5-4bb5-a38e-af9228d3e2f1.lance b/.lancedb/nltk_chunking.lance/data/a510c48e-92c5-4bb5-a38e-af9228d3e2f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..93e894e417d8ccea6f418ef1d30e1e0afcb05a27 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a510c48e-92c5-4bb5-a38e-af9228d3e2f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:595c1a48ee69b7ea9ef37dd15f1a534a88cec50ae57045972f9ec0a8bde7a8ff +size 58613 diff --git a/.lancedb/nltk_chunking.lance/data/a6081ed5-8a87-4337-a820-e9217842fafe.lance b/.lancedb/nltk_chunking.lance/data/a6081ed5-8a87-4337-a820-e9217842fafe.lance new file mode 100644 index 0000000000000000000000000000000000000000..a42d8e682e450ae191fb713b7df5982b28dab70f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a6081ed5-8a87-4337-a820-e9217842fafe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6223199a24cfcae41132424b1d54ad3c580693b59f78309988427be41c26da7 +size 53845 diff --git a/.lancedb/nltk_chunking.lance/data/a7088f28-7524-498e-91fc-bf3b13338666.lance b/.lancedb/nltk_chunking.lance/data/a7088f28-7524-498e-91fc-bf3b13338666.lance new file mode 100644 index 0000000000000000000000000000000000000000..c1577771ff29c5e3fb177e97235aef43fcb42ddd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a7088f28-7524-498e-91fc-bf3b13338666.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8030c57c6545da2528a2663f4b33468324f6008dec2fd2f37856930792a57b3e +size 54656 diff --git a/.lancedb/nltk_chunking.lance/data/a724343a-be27-4ae7-a448-ec138508d5c5.lance b/.lancedb/nltk_chunking.lance/data/a724343a-be27-4ae7-a448-ec138508d5c5.lance new file mode 100644 index 0000000000000000000000000000000000000000..f0bd102993884d03db79a33e75401252dd336473 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/a724343a-be27-4ae7-a448-ec138508d5c5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca7be9cd9d9a5d7cdf0414166363d3e6b18ec7cfbe29b18fb4049478d105fd11 +size 56540 diff --git a/.lancedb/nltk_chunking.lance/data/aa5fcd32-cdbe-418d-9b64-80c80e4e2e6e.lance b/.lancedb/nltk_chunking.lance/data/aa5fcd32-cdbe-418d-9b64-80c80e4e2e6e.lance new file mode 100644 index 0000000000000000000000000000000000000000..74f72629766694b42519843e093139e80880127e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/aa5fcd32-cdbe-418d-9b64-80c80e4e2e6e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:b96337d9fd4f7913cf9322a3651af5977a5c3a82393970eb537188627f7c45f6 +size 57136 diff --git a/.lancedb/nltk_chunking.lance/data/aa68039a-b1fe-490d-8197-5fa4d95551ac.lance b/.lancedb/nltk_chunking.lance/data/aa68039a-b1fe-490d-8197-5fa4d95551ac.lance new file mode 100644 index 0000000000000000000000000000000000000000..f7443db9f56295fb9d8ffde16119ca2c56513b2c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/aa68039a-b1fe-490d-8197-5fa4d95551ac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c52708591626d0d27040e95752c57f4d7a126beb6a3cd3c5aabbfa9acbcc55bd +size 54069 diff --git a/.lancedb/nltk_chunking.lance/data/ab2259b0-8de9-4199-b660-3422651741cd.lance b/.lancedb/nltk_chunking.lance/data/ab2259b0-8de9-4199-b660-3422651741cd.lance new file mode 100644 index 0000000000000000000000000000000000000000..e667368ffb1a6a10af4c6a4a9dc00a786baede5d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ab2259b0-8de9-4199-b660-3422651741cd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15d6331e992459df04ce274b0c60a4c914f685ee315dfb2071b0ecc03944cd7e +size 56010 diff --git a/.lancedb/nltk_chunking.lance/data/ac7a346d-ea9e-47bb-b704-dd5d246c2755.lance b/.lancedb/nltk_chunking.lance/data/ac7a346d-ea9e-47bb-b704-dd5d246c2755.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e78939a9250704a7628a8bd9f75bace3bd6b161 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ac7a346d-ea9e-47bb-b704-dd5d246c2755.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78a7ef4a0781d095d99adcfecaba5e9b2dc3c6f5a2a54138e6034edda9178062 +size 58094 diff --git a/.lancedb/nltk_chunking.lance/data/aca2e8a1-5983-4bb2-8572-705dcefbba54.lance b/.lancedb/nltk_chunking.lance/data/aca2e8a1-5983-4bb2-8572-705dcefbba54.lance new file mode 100644 index 0000000000000000000000000000000000000000..bf6b66d79a6d0b1ad06631d149c4bf325c467326 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/aca2e8a1-5983-4bb2-8572-705dcefbba54.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c2b710bc0e6c6c191678b0883cbf6a59d0a2b274fbced66178d790dda701e17 +size 56403 diff --git a/.lancedb/nltk_chunking.lance/data/ad03b403-aee0-4c9a-bf9d-c11c16c99867.lance b/.lancedb/nltk_chunking.lance/data/ad03b403-aee0-4c9a-bf9d-c11c16c99867.lance new file mode 100644 index 0000000000000000000000000000000000000000..2768d685e7cb6106b3bfb48b559885ee40bc449c --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ad03b403-aee0-4c9a-bf9d-c11c16c99867.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed8d9e380db68813d336d779830a889804dabc86be070fcbf19e4de6c49b55d +size 54314 diff --git a/.lancedb/nltk_chunking.lance/data/ad23e125-cddb-44a9-ad83-ff0a43201d1b.lance b/.lancedb/nltk_chunking.lance/data/ad23e125-cddb-44a9-ad83-ff0a43201d1b.lance new file mode 100644 index 0000000000000000000000000000000000000000..4056d495227b16da8195b434acd7978a1e681499 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ad23e125-cddb-44a9-ad83-ff0a43201d1b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ab71b02f1b613d1dc8a99e4d17662d523f92129d090adf9cf1ea7121ea35dd +size 56010 diff --git a/.lancedb/nltk_chunking.lance/data/adfef77e-bbf0-4004-8007-11a3b750461d.lance b/.lancedb/nltk_chunking.lance/data/adfef77e-bbf0-4004-8007-11a3b750461d.lance new file mode 100644 index 0000000000000000000000000000000000000000..41328227364b10411203f389bdbbd412b4df4060 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/adfef77e-bbf0-4004-8007-11a3b750461d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be4d3d28afc27d1827bece0ac60f3a6e2e9e1b0a8d0758202f3867d5618291ec +size 55807 diff --git a/.lancedb/nltk_chunking.lance/data/ae60d056-48d6-45da-8ab9-0e103692f056.lance b/.lancedb/nltk_chunking.lance/data/ae60d056-48d6-45da-8ab9-0e103692f056.lance new file mode 100644 index 0000000000000000000000000000000000000000..a1c5267f4414f50a5592f74a2b1bf2a30505da27 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ae60d056-48d6-45da-8ab9-0e103692f056.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f828aedb0a0c351f4aeceab2bdf8ed473c334af15a818e1542b82efeab7b2f9b +size 58656 diff --git a/.lancedb/nltk_chunking.lance/data/ae7f6af7-9cc2-4eec-abfa-fc98e316166a.lance b/.lancedb/nltk_chunking.lance/data/ae7f6af7-9cc2-4eec-abfa-fc98e316166a.lance new file mode 100644 index 0000000000000000000000000000000000000000..fbd00f0ca1e4f803d1763037bf26befca9faef39 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ae7f6af7-9cc2-4eec-abfa-fc98e316166a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c0f0472733e4d20b3a516b6ad024de6ba160cb2636fba5d377d259284c002cf +size 56924 diff --git a/.lancedb/nltk_chunking.lance/data/ae8d82b4-82ab-49b7-a91c-ecab401c43db.lance b/.lancedb/nltk_chunking.lance/data/ae8d82b4-82ab-49b7-a91c-ecab401c43db.lance new file mode 100644 index 0000000000000000000000000000000000000000..926f5a6d9ff84552df04348721ec7cb64deb77b7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ae8d82b4-82ab-49b7-a91c-ecab401c43db.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c23db9fab94fc6b37eb89b4b4ce3ba68729ecdb6b69a6c39e8490e4730e691a0 +size 53628 diff --git a/.lancedb/nltk_chunking.lance/data/af1e3d64-f9e8-449a-bd99-15357bc13f14.lance b/.lancedb/nltk_chunking.lance/data/af1e3d64-f9e8-449a-bd99-15357bc13f14.lance new file mode 100644 index 0000000000000000000000000000000000000000..ed29a04041d58bdecfcccb59191d8a785713d3ba --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/af1e3d64-f9e8-449a-bd99-15357bc13f14.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b218e94a4e46f3afe00201e9239c0ce5c1cca13695581199c5c77f47edd8ff17 +size 55073 diff --git a/.lancedb/nltk_chunking.lance/data/af34847c-6ac9-499b-a027-4b4abed6901b.lance b/.lancedb/nltk_chunking.lance/data/af34847c-6ac9-499b-a027-4b4abed6901b.lance new file mode 100644 index 0000000000000000000000000000000000000000..c83a5fe68d6b57240d6b4daad03f5bcbb34eda12 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/af34847c-6ac9-499b-a027-4b4abed6901b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c20fa3fbc594c856c9aa8941cc4321287d7d8ea77c163edb6baf71fc11f4b149 +size 58659 diff --git a/.lancedb/nltk_chunking.lance/data/af4ad824-5b5b-44d1-bd7f-0c0d056d1f23.lance b/.lancedb/nltk_chunking.lance/data/af4ad824-5b5b-44d1-bd7f-0c0d056d1f23.lance new file mode 100644 index 0000000000000000000000000000000000000000..4e5c9cb82e2079e22ecc1527b74b8cb85f649033 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/af4ad824-5b5b-44d1-bd7f-0c0d056d1f23.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b539af0dfd243bfd6fa7b4716c8e247a58d87ea436ff11e38e39284e748e980d +size 54263 diff --git a/.lancedb/nltk_chunking.lance/data/afe731a9-9afd-4d7d-bf89-6f400cf3a95a.lance b/.lancedb/nltk_chunking.lance/data/afe731a9-9afd-4d7d-bf89-6f400cf3a95a.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..a3016effb8b908622e27fe8476220b1b4c05a40e --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/afe731a9-9afd-4d7d-bf89-6f400cf3a95a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:743ddb7fb78e9d8b60bd7316241fddc6216fb49e01da88d313afd878ad99c19a +size 56622 diff --git a/.lancedb/nltk_chunking.lance/data/b0171055-376d-4977-b436-04ed77488435.lance b/.lancedb/nltk_chunking.lance/data/b0171055-376d-4977-b436-04ed77488435.lance new file mode 100644 index 0000000000000000000000000000000000000000..ad8b5d5e5e5cc50fc09bf3585fbf6285d85ba8d1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b0171055-376d-4977-b436-04ed77488435.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585bc074c9b4962087eb09a6b84502cf5e7a514e78758c794b8fb4baa93d3051 +size 57142 diff --git a/.lancedb/nltk_chunking.lance/data/b04f1732-549a-466b-9462-44661e9b765e.lance b/.lancedb/nltk_chunking.lance/data/b04f1732-549a-466b-9462-44661e9b765e.lance new file mode 100644 index 0000000000000000000000000000000000000000..2cf6a91a9e39d9a2a887b6bae7763452f9ccb705 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b04f1732-549a-466b-9462-44661e9b765e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69eb429964632310776a789bc0cdd1268ec12885c8d79ad3212ec5ae9fb89eed +size 56048 diff --git a/.lancedb/nltk_chunking.lance/data/b22f6a45-c4f6-4bfe-9960-d88bb91547cd.lance b/.lancedb/nltk_chunking.lance/data/b22f6a45-c4f6-4bfe-9960-d88bb91547cd.lance new file mode 100644 index 0000000000000000000000000000000000000000..98aa96f8fa6a6145004d683dd4f066f05cc9c14a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b22f6a45-c4f6-4bfe-9960-d88bb91547cd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec429889a67f3f85917c1d5ce5ee4cae0dde49d4a6cf8fd223838bc9400540ef +size 57456 diff --git a/.lancedb/nltk_chunking.lance/data/b2d2882d-6fd8-4d19-8dff-4bdca1b0ff5a.lance b/.lancedb/nltk_chunking.lance/data/b2d2882d-6fd8-4d19-8dff-4bdca1b0ff5a.lance new file mode 100644 index 0000000000000000000000000000000000000000..96480f2804c027a20b76b25bf657d0d07ccbc99a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b2d2882d-6fd8-4d19-8dff-4bdca1b0ff5a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a52c156237782c9921edec45efa93a8f57a6b39a77665337ae0836df028f40b +size 53951 diff --git a/.lancedb/nltk_chunking.lance/data/b361c104-336f-4dc7-96bf-c32acfcdbac5.lance b/.lancedb/nltk_chunking.lance/data/b361c104-336f-4dc7-96bf-c32acfcdbac5.lance new file mode 100644 index 0000000000000000000000000000000000000000..8bf2359e0c97aa7beb9069dd61563797bfaf53d9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b361c104-336f-4dc7-96bf-c32acfcdbac5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca9628c365f602adffc698dedfc9e895e8a40b567a2cc0b82ad58aaafcef5163 +size 54817 diff --git a/.lancedb/nltk_chunking.lance/data/b3dba73e-a72f-4cdd-a453-5d23a202bcc0.lance b/.lancedb/nltk_chunking.lance/data/b3dba73e-a72f-4cdd-a453-5d23a202bcc0.lance new file mode 100644 index 0000000000000000000000000000000000000000..eae6a80dfbc2700d7d7415f5a2aeda78ca18fad5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b3dba73e-a72f-4cdd-a453-5d23a202bcc0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c9429529e73611b977be604619677caeb5e695df0caf0a8d147178ab18c4ae0 +size 57245 diff --git 
a/.lancedb/nltk_chunking.lance/data/b40378f5-30f5-4603-99f0-b727536d27f8.lance b/.lancedb/nltk_chunking.lance/data/b40378f5-30f5-4603-99f0-b727536d27f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..130b2780795be2735c2822396a2160a619ec0a2a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b40378f5-30f5-4603-99f0-b727536d27f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dfcc4f35aafb3b53720cc54f2232cbc9c717eb62112eb2ba3c3455c118d7c7f +size 54745 diff --git a/.lancedb/nltk_chunking.lance/data/b4e0cf7f-00b8-4339-b04b-f55bf11a27e3.lance b/.lancedb/nltk_chunking.lance/data/b4e0cf7f-00b8-4339-b04b-f55bf11a27e3.lance new file mode 100644 index 0000000000000000000000000000000000000000..538d2bd386177574be0ba8aed5edf13d2d9eea90 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b4e0cf7f-00b8-4339-b04b-f55bf11a27e3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe6120f06b6cce83e234a3a6415cadada243a51d8c7acfe8643831ec68fbaa5 +size 54444 diff --git a/.lancedb/nltk_chunking.lance/data/b619bbdc-ecef-45eb-bffe-de8b29aa3bc8.lance b/.lancedb/nltk_chunking.lance/data/b619bbdc-ecef-45eb-bffe-de8b29aa3bc8.lance new file mode 100644 index 0000000000000000000000000000000000000000..14c17d39b58eea9c56856bacb0706fa772140af0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b619bbdc-ecef-45eb-bffe-de8b29aa3bc8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a684bba287dd2e628e7ffb5cd5bfac1f72ad722c91b305aba25d84cf1c8fc622 +size 54121 diff --git a/.lancedb/nltk_chunking.lance/data/b7dcfa68-7e64-4406-926a-91627116a061.lance b/.lancedb/nltk_chunking.lance/data/b7dcfa68-7e64-4406-926a-91627116a061.lance new file mode 100644 index 0000000000000000000000000000000000000000..d1d2f9fee6b07e73d0a84d8b52936615c4d791cf --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b7dcfa68-7e64-4406-926a-91627116a061.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac63214c6ae7cd1734f2f43d61b8eb0ac48ac330be932dee48cf5f052244090e +size 54620 diff --git a/.lancedb/nltk_chunking.lance/data/b80118f3-c655-4793-8a3f-313e760174ab.lance b/.lancedb/nltk_chunking.lance/data/b80118f3-c655-4793-8a3f-313e760174ab.lance new file mode 100644 index 0000000000000000000000000000000000000000..7c6a50194e271728988487d44e559e1a960c3876 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b80118f3-c655-4793-8a3f-313e760174ab.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc3242ed0ec3707e2fd6dc21e4f8c7a76fa7c981533400f67ad298094251d8b8 +size 56008 diff --git a/.lancedb/nltk_chunking.lance/data/b81e14f7-82ce-476b-9506-b445a6e38a7d.lance b/.lancedb/nltk_chunking.lance/data/b81e14f7-82ce-476b-9506-b445a6e38a7d.lance new file mode 100644 index 0000000000000000000000000000000000000000..91bc2474a188bf2fb348694f8daf475afd4d26d5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b81e14f7-82ce-476b-9506-b445a6e38a7d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4128530c3f4246ac0293f94245af0ba67cd242ae41560872d903ad001546020 +size 55475 diff --git a/.lancedb/nltk_chunking.lance/data/b941df88-fcfd-4e22-b559-edc9de89dea6.lance b/.lancedb/nltk_chunking.lance/data/b941df88-fcfd-4e22-b559-edc9de89dea6.lance new file mode 100644 index 0000000000000000000000000000000000000000..1251566716d20a596b9d0be04d5043278078f897 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b941df88-fcfd-4e22-b559-edc9de89dea6.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:6cee7f628e4d877a724c9233c3681ff24162f2c8e9a47df3b2aaea7808d60ab8 +size 54437 diff --git a/.lancedb/nltk_chunking.lance/data/b9440067-0014-401c-a899-9f6502057e4f.lance b/.lancedb/nltk_chunking.lance/data/b9440067-0014-401c-a899-9f6502057e4f.lance new file mode 100644 index 0000000000000000000000000000000000000000..8ce10c8a76ae3be514c2da7bc1c63e78559c0483 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b9440067-0014-401c-a899-9f6502057e4f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0583edcf6ef8bf71c6605c10e9a48aa471081a1dd3bdc532bd5d8a3a3b89f023 +size 55219 diff --git a/.lancedb/nltk_chunking.lance/data/b9fa2fa4-ab8c-47e7-bad5-ed8fbafc4bcd.lance b/.lancedb/nltk_chunking.lance/data/b9fa2fa4-ab8c-47e7-bad5-ed8fbafc4bcd.lance new file mode 100644 index 0000000000000000000000000000000000000000..1c07b66bdc145df6dfdda01ce4e1ec819056c913 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/b9fa2fa4-ab8c-47e7-bad5-ed8fbafc4bcd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d2fa74b7fe3ef8aae1d3f8dd1e393bf2e6d1dca6e9042a747858fae2329329d +size 55173 diff --git a/.lancedb/nltk_chunking.lance/data/bb4f69b5-45e4-483a-93ac-d7771946f15d.lance b/.lancedb/nltk_chunking.lance/data/bb4f69b5-45e4-483a-93ac-d7771946f15d.lance new file mode 100644 index 0000000000000000000000000000000000000000..ee538cf37ee6b20d8513ac0b3caf83e353afc07b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bb4f69b5-45e4-483a-93ac-d7771946f15d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4129169b86f83c98526f332aed8c49c80b9d8a22a5680296a25030a27706c448 +size 53779 diff --git a/.lancedb/nltk_chunking.lance/data/bbce4973-aed4-4112-87f1-dee19dfc6d08.lance b/.lancedb/nltk_chunking.lance/data/bbce4973-aed4-4112-87f1-dee19dfc6d08.lance new file mode 100644 index 0000000000000000000000000000000000000000..19f6130b495f2516ee86d5fd239f45888a15fdb0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bbce4973-aed4-4112-87f1-dee19dfc6d08.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:704603f2e561ffda249583e60487db899ed9f43367736f986a3ed14234cff6cf +size 54826 diff --git a/.lancedb/nltk_chunking.lance/data/bbd717fd-15a3-492c-a997-174b309a460c.lance b/.lancedb/nltk_chunking.lance/data/bbd717fd-15a3-492c-a997-174b309a460c.lance new file mode 100644 index 0000000000000000000000000000000000000000..3d49de87e52ee262b8d1be8b5751d0cb7fd847fd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bbd717fd-15a3-492c-a997-174b309a460c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2adb01d32fd5a351f8ada51522262c61941b74e92652ac7a09046da0e97241 +size 57360 diff --git a/.lancedb/nltk_chunking.lance/data/bc1bea7d-aa2b-4080-94ea-324b7c31dfa3.lance b/.lancedb/nltk_chunking.lance/data/bc1bea7d-aa2b-4080-94ea-324b7c31dfa3.lance new file mode 100644 index 0000000000000000000000000000000000000000..aedf05198be8ea984968752586e30aa27c833421 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bc1bea7d-aa2b-4080-94ea-324b7c31dfa3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b4ebd472770d00cae062dff4065ff298129dc18728d189f7ab4924b1a0cd412 +size 57469 diff --git a/.lancedb/nltk_chunking.lance/data/bc3ad613-794d-42a0-b5c2-98e01ca4ca43.lance b/.lancedb/nltk_chunking.lance/data/bc3ad613-794d-42a0-b5c2-98e01ca4ca43.lance new file mode 100644 index 0000000000000000000000000000000000000000..7bb26e141d58ba4f3a39986091f8bd0a4df929c3 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bc3ad613-794d-42a0-b5c2-98e01ca4ca43.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7248ee364caa5d5591306acab7ba25b0318bd20a9ee7958f8768e13850b0afe2 +size 58983 diff --git a/.lancedb/nltk_chunking.lance/data/bd38c7bc-c9a2-4496-909a-1c69746e53fd.lance b/.lancedb/nltk_chunking.lance/data/bd38c7bc-c9a2-4496-909a-1c69746e53fd.lance new file mode 100644 index 0000000000000000000000000000000000000000..8bd2fd9467fc02a49b6c19ba96ad379d2ed37c14 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bd38c7bc-c9a2-4496-909a-1c69746e53fd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f14182e28a8172015459481e231328e89da2190ff57a2a4d7cf4468c89d9f32 +size 56051 diff --git a/.lancedb/nltk_chunking.lance/data/bde43530-ebc0-4d19-9592-229a4b67a8ab.lance b/.lancedb/nltk_chunking.lance/data/bde43530-ebc0-4d19-9592-229a4b67a8ab.lance new file mode 100644 index 0000000000000000000000000000000000000000..c1c17d66b19e863426d9e934015ae562f01244da --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bde43530-ebc0-4d19-9592-229a4b67a8ab.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d606134df6182d815b40e53738162090e58cb83e579949fd00a9007767c6d924 +size 56241 diff --git a/.lancedb/nltk_chunking.lance/data/bfa733cf-4e61-4ad2-a0e1-1fd8546cfa82.lance b/.lancedb/nltk_chunking.lance/data/bfa733cf-4e61-4ad2-a0e1-1fd8546cfa82.lance new file mode 100644 index 0000000000000000000000000000000000000000..0ce0a7bbd58489a7bf26224a2ee0352c5f3185da --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/bfa733cf-4e61-4ad2-a0e1-1fd8546cfa82.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db9b3492b8cbd98c329bc685be83a9796628c0b8f3d83f42046144650275a725 +size 54370 diff --git a/.lancedb/nltk_chunking.lance/data/c0016efa-abb3-4714-a3af-3e85c73efed3.lance b/.lancedb/nltk_chunking.lance/data/c0016efa-abb3-4714-a3af-3e85c73efed3.lance new file mode 100644 index 0000000000000000000000000000000000000000..f8f985067122ef513d10cce4b9c800c391fcbdcc --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c0016efa-abb3-4714-a3af-3e85c73efed3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68741c84dd7efee9cf59e9f18bf3307c4ad11480c38d9a9e1afa1afc6061ac48 +size 54447 diff --git a/.lancedb/nltk_chunking.lance/data/c2e7531f-542e-4352-9d1b-c746981c75b5.lance b/.lancedb/nltk_chunking.lance/data/c2e7531f-542e-4352-9d1b-c746981c75b5.lance new file mode 100644 index 0000000000000000000000000000000000000000..899991acac89fcf6cf93711f27b2403ac16c20d6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c2e7531f-542e-4352-9d1b-c746981c75b5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78fec3b7e826fec6a03bd77f3a11b335da72584ceca3464da23e2e680c039d14 +size 54387 diff --git a/.lancedb/nltk_chunking.lance/data/c4331a0b-f744-47a6-a42a-0d90a93c0058.lance b/.lancedb/nltk_chunking.lance/data/c4331a0b-f744-47a6-a42a-0d90a93c0058.lance new file mode 100644 index 0000000000000000000000000000000000000000..c4c7a6eb55ee50c0e6b0825d668182c37c91cdd0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c4331a0b-f744-47a6-a42a-0d90a93c0058.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6b6a714984dc3c71be3d2184e590aecc403c92d2a7a70094f17fe419c8be59 +size 54469 diff --git a/.lancedb/nltk_chunking.lance/data/c5a494ec-e0ad-41d1-947d-80c7829686ce.lance 
b/.lancedb/nltk_chunking.lance/data/c5a494ec-e0ad-41d1-947d-80c7829686ce.lance new file mode 100644 index 0000000000000000000000000000000000000000..59377cc51824ce9b94cb42b7cc919c5fcfbd7b7a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c5a494ec-e0ad-41d1-947d-80c7829686ce.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c7c4ba4be08bc00ccff2bb976fbcb97f0f077bb29f637a5c960374884e74655 +size 62774 diff --git a/.lancedb/nltk_chunking.lance/data/c637a597-00da-4dec-9287-6d91c46ad2ae.lance b/.lancedb/nltk_chunking.lance/data/c637a597-00da-4dec-9287-6d91c46ad2ae.lance new file mode 100644 index 0000000000000000000000000000000000000000..17ceaeef5243b3079bb8818b1401c578de97ad42 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c637a597-00da-4dec-9287-6d91c46ad2ae.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ad6e4ec980e8ed08ad4b3d5057761ab1179cb2f20cc2b2f8fed20a86886b39 +size 53641 diff --git a/.lancedb/nltk_chunking.lance/data/c6848fbf-8474-41cc-8bf2-da9759cd8b2e.lance b/.lancedb/nltk_chunking.lance/data/c6848fbf-8474-41cc-8bf2-da9759cd8b2e.lance new file mode 100644 index 0000000000000000000000000000000000000000..b8f3202f9581a7bc49b4851c2b20a612188d0ffc --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c6848fbf-8474-41cc-8bf2-da9759cd8b2e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9517918dfea564b86bcf798c99ee5d2ed5e29e00c7e0c0eee1256d0e937df7d0 +size 54786 diff --git a/.lancedb/nltk_chunking.lance/data/c78bc692-f42d-42b2-a131-dfa6eb7f79e9.lance b/.lancedb/nltk_chunking.lance/data/c78bc692-f42d-42b2-a131-dfa6eb7f79e9.lance new file mode 100644 index 0000000000000000000000000000000000000000..24eca7cb0a39e6087d661fc19721b90c4a784d80 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c78bc692-f42d-42b2-a131-dfa6eb7f79e9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb31fb6a58a52deadd7622ed38703514b4dbeb749843962aac0e19d6aa1a6879 +size 55409 diff --git a/.lancedb/nltk_chunking.lance/data/c79e8737-acc7-49bb-a6d8-964e2bb82440.lance b/.lancedb/nltk_chunking.lance/data/c79e8737-acc7-49bb-a6d8-964e2bb82440.lance new file mode 100644 index 0000000000000000000000000000000000000000..bdbea55747f1190463d47c1cd69af795e4680eb5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c79e8737-acc7-49bb-a6d8-964e2bb82440.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0960af1ef3aec5eb8dd15f28b1abcef7cefc91336fc63e0106a173412a0e82c2 +size 56955 diff --git a/.lancedb/nltk_chunking.lance/data/c7d6a333-01b6-486c-ae6e-b031d117f5d6.lance b/.lancedb/nltk_chunking.lance/data/c7d6a333-01b6-486c-ae6e-b031d117f5d6.lance new file mode 100644 index 0000000000000000000000000000000000000000..8544610b30547b50838339c450400078069aaf02 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c7d6a333-01b6-486c-ae6e-b031d117f5d6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e80a9af38eb0f049b0147e527dd9fbb504c3d9c26bfe0f727cc38cd2955b6307 +size 55471 diff --git a/.lancedb/nltk_chunking.lance/data/c7e37773-f941-4bb8-a8d9-f8d1c87c7aec.lance b/.lancedb/nltk_chunking.lance/data/c7e37773-f941-4bb8-a8d9-f8d1c87c7aec.lance new file mode 100644 index 0000000000000000000000000000000000000000..8c85f3d61ab818c7ed900481126e1bf234e88423 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c7e37773-f941-4bb8-a8d9-f8d1c87c7aec.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f7a43742a0cf6147fe07881ef53329270b15212191d4b1629fe97d794b3c6206 +size 53812 diff --git a/.lancedb/nltk_chunking.lance/data/c81d9649-1eb2-49a6-a7cd-18a0a07291ec.lance b/.lancedb/nltk_chunking.lance/data/c81d9649-1eb2-49a6-a7cd-18a0a07291ec.lance new file mode 100644 index 0000000000000000000000000000000000000000..cecafd875c4d9a2d16746283f0c08a3a0b3bb23b --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c81d9649-1eb2-49a6-a7cd-18a0a07291ec.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f2b9d739285488e7eec66efc8299a96e8c687503dfbd135ce63816962d0695d +size 54575 diff --git a/.lancedb/nltk_chunking.lance/data/c8e68e9e-a2cb-4d64-b162-53d0fb1904ec.lance b/.lancedb/nltk_chunking.lance/data/c8e68e9e-a2cb-4d64-b162-53d0fb1904ec.lance new file mode 100644 index 0000000000000000000000000000000000000000..f58ece27cb2bd611c1c0aedd616da3f2dac08768 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/c8e68e9e-a2cb-4d64-b162-53d0fb1904ec.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bc22aef9bb46eadf84bed8f9ce1aa11ab0ab5505bf95df16d4d5167b03799c1 +size 58441 diff --git a/.lancedb/nltk_chunking.lance/data/ca19392b-2446-45ce-adec-a0eebde82eff.lance b/.lancedb/nltk_chunking.lance/data/ca19392b-2446-45ce-adec-a0eebde82eff.lance new file mode 100644 index 0000000000000000000000000000000000000000..7b8cfeee3dbec37d8dac0d7327991752dfea9745 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ca19392b-2446-45ce-adec-a0eebde82eff.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b798ee7c29085455b4b42fde375319a066ad976089ac52c2f129b4f81f8eb237 +size 56596 diff --git a/.lancedb/nltk_chunking.lance/data/cb647dfe-1a98-4eb5-9427-c6cbe89702aa.lance b/.lancedb/nltk_chunking.lance/data/cb647dfe-1a98-4eb5-9427-c6cbe89702aa.lance new file mode 100644 index 0000000000000000000000000000000000000000..fc8ae47b9c1f4aae348c470e520c7f70e41bc1da --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/cb647dfe-1a98-4eb5-9427-c6cbe89702aa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03804120e79b284ccf6b385133a17cc9830b62806f8f8778ed0b42a687c187e +size 52886 diff --git a/.lancedb/nltk_chunking.lance/data/cbec6761-2e60-4765-9503-37fa8d325cf1.lance b/.lancedb/nltk_chunking.lance/data/cbec6761-2e60-4765-9503-37fa8d325cf1.lance new file mode 100644 index 0000000000000000000000000000000000000000..b91d17d237d9e9a6e1171c855da01b21fbae11aa --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/cbec6761-2e60-4765-9503-37fa8d325cf1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:206784781c02322a8435ce12372a700f51552f5393ac6d6714e08e8912bbeffa +size 55174 diff --git a/.lancedb/nltk_chunking.lance/data/cc04f467-ed67-4f1d-b767-7b54a81899c2.lance b/.lancedb/nltk_chunking.lance/data/cc04f467-ed67-4f1d-b767-7b54a81899c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..60adbe7a5a74f3e401430071ab1cd19cccb477ce --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/cc04f467-ed67-4f1d-b767-7b54a81899c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1392cfcc85c2e5f5a4e4dfa78bf7a3a36e093b9914aa025979b35819a0ccd5d7 +size 57069 diff --git a/.lancedb/nltk_chunking.lance/data/cd43809b-581d-470f-a496-54dc82e72dd9.lance b/.lancedb/nltk_chunking.lance/data/cd43809b-581d-470f-a496-54dc82e72dd9.lance new file mode 100644 index 0000000000000000000000000000000000000000..b3e3b524f0daa7abba004f4e6c83c1c71f90ff31 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/cd43809b-581d-470f-a496-54dc82e72dd9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:970ab2297e53213c83502bc260be1960c91c86022cbfba89842e800c078c6fa4 +size 57827 diff --git a/.lancedb/nltk_chunking.lance/data/ce949c30-777e-4725-99a9-112fb7b26f9f.lance b/.lancedb/nltk_chunking.lance/data/ce949c30-777e-4725-99a9-112fb7b26f9f.lance new file mode 100644 index 0000000000000000000000000000000000000000..5e470f4d66883d681ffdaba5bae4724f9fdb6f59 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ce949c30-777e-4725-99a9-112fb7b26f9f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1508518beea8dfbdb8a7614294eea4c9b88af1de48905e6993c71d1e54147f1a +size 56069 diff --git a/.lancedb/nltk_chunking.lance/data/d0c875c5-3822-4b42-9a8a-d82e893d2c08.lance b/.lancedb/nltk_chunking.lance/data/d0c875c5-3822-4b42-9a8a-d82e893d2c08.lance new file mode 100644 index 0000000000000000000000000000000000000000..52c53d9b44e85faefa0f27832f0b5cd70275e2df --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d0c875c5-3822-4b42-9a8a-d82e893d2c08.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56e13426eecdc666768c8a748543e7346a84aec3a254920990bd0ed2a2570db4 +size 56755 diff --git a/.lancedb/nltk_chunking.lance/data/d1f708de-f25a-41c9-8165-c1cb481ff216.lance b/.lancedb/nltk_chunking.lance/data/d1f708de-f25a-41c9-8165-c1cb481ff216.lance new file mode 100644 index 0000000000000000000000000000000000000000..212e2b393c8ebb0c5c7bd840f5a7bb495c4fde70 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d1f708de-f25a-41c9-8165-c1cb481ff216.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:288cdadc058b7a011d0ceb8484e1f27c8618dd70252a6d9b50d4b3e75e00b9de +size 57387 diff --git a/.lancedb/nltk_chunking.lance/data/d212f889-edc3-494b-9733-266f8d14c2c7.lance b/.lancedb/nltk_chunking.lance/data/d212f889-edc3-494b-9733-266f8d14c2c7.lance new file mode 100644 index 0000000000000000000000000000000000000000..fbd80afdbb70b5eb5f03529bebff68b61d14f654 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d212f889-edc3-494b-9733-266f8d14c2c7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdc60c0fb13ff562ebea6183a40657211688d49a7a45a0d10b28c7b8b852336c +size 55855 diff --git a/.lancedb/nltk_chunking.lance/data/d2521c80-116e-4800-b282-923c1ac8040f.lance b/.lancedb/nltk_chunking.lance/data/d2521c80-116e-4800-b282-923c1ac8040f.lance new file mode 100644 index 0000000000000000000000000000000000000000..03a4a5c54b26381c54aae16855693794ce62b8d2 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d2521c80-116e-4800-b282-923c1ac8040f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63051b14da1c2c11be5895ebff6cbea11e47ad9992d3939764c034f6fa1128aa +size 55471 diff --git a/.lancedb/nltk_chunking.lance/data/d4cfd316-1cd5-4226-b92d-d528d7340ed3.lance b/.lancedb/nltk_chunking.lance/data/d4cfd316-1cd5-4226-b92d-d528d7340ed3.lance new file mode 100644 index 0000000000000000000000000000000000000000..a98cba96f8ca13f07ed607f3a2c5df2008e0d7fe --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d4cfd316-1cd5-4226-b92d-d528d7340ed3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bcce542c77be1a3f138be1101bad68a0f5321f7c72a144d119a6ee65b725372 +size 55742 diff --git a/.lancedb/nltk_chunking.lance/data/d61b6a6d-2835-4ec5-bf45-19e32380589c.lance b/.lancedb/nltk_chunking.lance/data/d61b6a6d-2835-4ec5-bf45-19e32380589c.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..87f5c7712082e4d268a5a0aa4d1b239c37054afd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d61b6a6d-2835-4ec5-bf45-19e32380589c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2f4bf7b483333c7db9918f79c2c65701353464ba1330a78c1332500554dd9b2 +size 53518 diff --git a/.lancedb/nltk_chunking.lance/data/d65fd169-f5ec-465f-98ab-fb9c524f2ef9.lance b/.lancedb/nltk_chunking.lance/data/d65fd169-f5ec-465f-98ab-fb9c524f2ef9.lance new file mode 100644 index 0000000000000000000000000000000000000000..7eb555e0cafbe8472b8d274eb8a8d1cf53c5b57a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d65fd169-f5ec-465f-98ab-fb9c524f2ef9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83bde6c62123f1a600fe680f08f6420fb7dd17b8aacc458c7b492121ec314ed4 +size 55705 diff --git a/.lancedb/nltk_chunking.lance/data/d8c77a99-903b-4287-86fc-f0b2cc896dff.lance b/.lancedb/nltk_chunking.lance/data/d8c77a99-903b-4287-86fc-f0b2cc896dff.lance new file mode 100644 index 0000000000000000000000000000000000000000..355d43f835928bf748804f1afc3fb1babcde98d5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/d8c77a99-903b-4287-86fc-f0b2cc896dff.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6cbb937dff629f7bc3c564d62b39c27390e45234c67ad165aa3fc798105c0a1 +size 55395 diff --git a/.lancedb/nltk_chunking.lance/data/daeaf599-2f4b-40e3-99e8-d56129344b09.lance b/.lancedb/nltk_chunking.lance/data/daeaf599-2f4b-40e3-99e8-d56129344b09.lance new file mode 100644 index 0000000000000000000000000000000000000000..108ecf098e410bd7605275e92a1c8cf51d165970 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/daeaf599-2f4b-40e3-99e8-d56129344b09.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ed360f876abefd844a8fdb23d6b2b2ac1886fff334b3e1e2d97eafc9f470ce4 +size 60892 diff --git a/.lancedb/nltk_chunking.lance/data/db0af45d-8359-41d0-a6e9-301cd2fd5a41.lance b/.lancedb/nltk_chunking.lance/data/db0af45d-8359-41d0-a6e9-301cd2fd5a41.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d5c3c876f8929202520ee2efd8461f9e4ce34f2 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/db0af45d-8359-41d0-a6e9-301cd2fd5a41.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ed51a1e17cbaf944ec636010fb81ab6e069bce5abe84730e893d4dbf48b6dd +size 58164 diff --git a/.lancedb/nltk_chunking.lance/data/dc595167-2d69-49bd-a7e8-039af4f95004.lance b/.lancedb/nltk_chunking.lance/data/dc595167-2d69-49bd-a7e8-039af4f95004.lance new file mode 100644 index 0000000000000000000000000000000000000000..ad2947bc11073ec7c2b18cc31f2a00469da4b965 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/dc595167-2d69-49bd-a7e8-039af4f95004.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205cc8a2ede62b360d960218bc99ba2fb7f7787f9fb988ff5848e3e59897f0c8 +size 54641 diff --git a/.lancedb/nltk_chunking.lance/data/dcd93fd4-5ab5-4d54-9c08-da51bf96076d.lance b/.lancedb/nltk_chunking.lance/data/dcd93fd4-5ab5-4d54-9c08-da51bf96076d.lance new file mode 100644 index 0000000000000000000000000000000000000000..ebe9b68b34c79a558d0edcc310d0ec9177d831df --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/dcd93fd4-5ab5-4d54-9c08-da51bf96076d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e15653acd691946a8c7585f75a3553c17724c1a6ab85d01253a691b83e9f5c2 +size 55764 diff --git 
a/.lancedb/nltk_chunking.lance/data/dd3e0ae2-6c99-42e9-bfdc-609a2ceb1670.lance b/.lancedb/nltk_chunking.lance/data/dd3e0ae2-6c99-42e9-bfdc-609a2ceb1670.lance new file mode 100644 index 0000000000000000000000000000000000000000..3767e7d58a794f0f08c5804c9972109889677922 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/dd3e0ae2-6c99-42e9-bfdc-609a2ceb1670.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81432c18a97b8bd50737bc1666283af332e36266c14a5e2a738c30ef8fa7abe4 +size 54853 diff --git a/.lancedb/nltk_chunking.lance/data/dd42ae22-ebf8-483a-921b-ab4f5c3f331a.lance b/.lancedb/nltk_chunking.lance/data/dd42ae22-ebf8-483a-921b-ab4f5c3f331a.lance new file mode 100644 index 0000000000000000000000000000000000000000..ec8d120f8b4e6adbc6b5fee794c2cc2d997ecc3d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/dd42ae22-ebf8-483a-921b-ab4f5c3f331a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb80970c3f999c365f38bffb4b0974b8db8d3d24b898ee487a20daf9b455452 +size 53681 diff --git a/.lancedb/nltk_chunking.lance/data/ddf9b923-1e9d-42dc-a8cd-2656046808f1.lance b/.lancedb/nltk_chunking.lance/data/ddf9b923-1e9d-42dc-a8cd-2656046808f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..2b91dbbe8176ad5964d5e57a97d2e54c038d8d1d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ddf9b923-1e9d-42dc-a8cd-2656046808f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0526216cd8f4a351bddc0e2cedbd97f9182297064c2984fb30baa485af74e462 +size 55042 diff --git a/.lancedb/nltk_chunking.lance/data/df2e76c0-277c-4ca4-b1bd-8a670ea34c9e.lance b/.lancedb/nltk_chunking.lance/data/df2e76c0-277c-4ca4-b1bd-8a670ea34c9e.lance new file mode 100644 index 0000000000000000000000000000000000000000..dc75af1a9cf6b8b2f3c398eab12263f987a2fb48 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/df2e76c0-277c-4ca4-b1bd-8a670ea34c9e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edd2a1b68c71de545d56b711be39cbc94e20e6b35948b262cd30bc7f18c71825 +size 54462 diff --git a/.lancedb/nltk_chunking.lance/data/dfad277e-5911-48a7-a8e0-91107d50a295.lance b/.lancedb/nltk_chunking.lance/data/dfad277e-5911-48a7-a8e0-91107d50a295.lance new file mode 100644 index 0000000000000000000000000000000000000000..77e58bd2d0482dfd9ce9fda31353b7f105dde1ab --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/dfad277e-5911-48a7-a8e0-91107d50a295.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65285d0c5708649dec4716d086b4b2aeafaafac391b10990a98320ba4a5306bc +size 55767 diff --git a/.lancedb/nltk_chunking.lance/data/e013a31a-e0bc-4b51-b328-e0881554325c.lance b/.lancedb/nltk_chunking.lance/data/e013a31a-e0bc-4b51-b328-e0881554325c.lance new file mode 100644 index 0000000000000000000000000000000000000000..08bcabc988605a208d9b0a1e2e47cd28d842d953 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e013a31a-e0bc-4b51-b328-e0881554325c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c2d3c0ef2dd164b38f5168f4d0159ffcbb2281b71762ffdd91617a74864913 +size 54703 diff --git a/.lancedb/nltk_chunking.lance/data/e036b15a-de03-460e-a1af-594bf468837b.lance b/.lancedb/nltk_chunking.lance/data/e036b15a-de03-460e-a1af-594bf468837b.lance new file mode 100644 index 0000000000000000000000000000000000000000..33f4111cd5c642eb9dd51d1d1a61c1e9e5ff205a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e036b15a-de03-460e-a1af-594bf468837b.lance @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3dd4e1e1a630fb9c0565e2ec97c69e2d8f40213118aed7840ec643e8b895de09 +size 54531 diff --git a/.lancedb/nltk_chunking.lance/data/e099ed00-8014-4467-ba2c-a1247c49bd6b.lance b/.lancedb/nltk_chunking.lance/data/e099ed00-8014-4467-ba2c-a1247c49bd6b.lance new file mode 100644 index 0000000000000000000000000000000000000000..471dca71db835f117280c6ce9098c2288b7983ed --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e099ed00-8014-4467-ba2c-a1247c49bd6b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46027743e2f6b3111c877e6f22cfc760549f4fd7d114f1501d0eba71c82825b8 +size 54579 diff --git a/.lancedb/nltk_chunking.lance/data/e0be814b-f7f6-4db0-a79f-9465aba7797b.lance b/.lancedb/nltk_chunking.lance/data/e0be814b-f7f6-4db0-a79f-9465aba7797b.lance new file mode 100644 index 0000000000000000000000000000000000000000..c15038cbee79829c99a0551ddaaed2591bae2222 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e0be814b-f7f6-4db0-a79f-9465aba7797b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d5fdb5ca54af7a59cde84884222800e8c156395d35aa5194b4268b7a4d6026f +size 53389 diff --git a/.lancedb/nltk_chunking.lance/data/e1a3996d-481e-4a35-8c30-cdaee6abbbaa.lance b/.lancedb/nltk_chunking.lance/data/e1a3996d-481e-4a35-8c30-cdaee6abbbaa.lance new file mode 100644 index 0000000000000000000000000000000000000000..2678008f9797b29688cdf305135c52c48dbc18c4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e1a3996d-481e-4a35-8c30-cdaee6abbbaa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e47c6ec538f045f74b2cf2278bf92b4789d94b1b505af97ea6f5bf78e82b6f +size 56675 diff --git a/.lancedb/nltk_chunking.lance/data/e1b11d56-8f08-48dd-8cc6-ac491f9f5b58.lance b/.lancedb/nltk_chunking.lance/data/e1b11d56-8f08-48dd-8cc6-ac491f9f5b58.lance new file mode 100644 index 0000000000000000000000000000000000000000..e0d9428b07185e479ee19f8891af160315c450b8 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e1b11d56-8f08-48dd-8cc6-ac491f9f5b58.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c38f191a266520cae2a4e16c83226bfc07f4755a8101b99bcd02872ef92cba0a +size 57221 diff --git a/.lancedb/nltk_chunking.lance/data/e2c7e7af-2056-4829-90f2-edf7b5aad300.lance b/.lancedb/nltk_chunking.lance/data/e2c7e7af-2056-4829-90f2-edf7b5aad300.lance new file mode 100644 index 0000000000000000000000000000000000000000..86f9bda31877cc9825a552c8d3e2a2a92a34e371 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e2c7e7af-2056-4829-90f2-edf7b5aad300.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86c6d74caf4008cf5dd222fa0f4bec3acf0a57a0338931e13e241979a1ae615a +size 53988 diff --git a/.lancedb/nltk_chunking.lance/data/e58913a3-347e-43ca-b7ed-d8294d4c6c45.lance b/.lancedb/nltk_chunking.lance/data/e58913a3-347e-43ca-b7ed-d8294d4c6c45.lance new file mode 100644 index 0000000000000000000000000000000000000000..e8e9201821d85ca4c024179867e21f3748b9c230 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e58913a3-347e-43ca-b7ed-d8294d4c6c45.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6e52f8d2ea5a22488eb0a390ad2ce0ee58aeca94814ea2b55eeb949cf2efad +size 55096 diff --git a/.lancedb/nltk_chunking.lance/data/e607538a-024e-44ee-84ad-4151cb6354c8.lance b/.lancedb/nltk_chunking.lance/data/e607538a-024e-44ee-84ad-4151cb6354c8.lance new file mode 100644 index 0000000000000000000000000000000000000000..b219afc93561b9342601fab96dbf3ad3899a2ec0 
--- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e607538a-024e-44ee-84ad-4151cb6354c8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384f8517107d4a72fe35465798a2e2318e602af524f4c8b8dabc26822b921fcd +size 53855 diff --git a/.lancedb/nltk_chunking.lance/data/e647d3f4-4aa3-4a89-b714-46ce32803945.lance b/.lancedb/nltk_chunking.lance/data/e647d3f4-4aa3-4a89-b714-46ce32803945.lance new file mode 100644 index 0000000000000000000000000000000000000000..85992872d7507bbb785123f85cffb30b038cf2c1 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e647d3f4-4aa3-4a89-b714-46ce32803945.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816ccad3c692c1cb1fa2a9159016529409f9ccf9ae74ecf1e45a76767acee902 +size 54626 diff --git a/.lancedb/nltk_chunking.lance/data/e6da7321-65d1-4677-bd85-00f4d7f9a819.lance b/.lancedb/nltk_chunking.lance/data/e6da7321-65d1-4677-bd85-00f4d7f9a819.lance new file mode 100644 index 0000000000000000000000000000000000000000..deafa4e5ebf3b76979e5d121b61d1b04d2494236 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e6da7321-65d1-4677-bd85-00f4d7f9a819.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e454543b9ccdb54bfa60f51f76414e4c3cad78adf2da213e4fc72c9f54ac95f +size 57141 diff --git a/.lancedb/nltk_chunking.lance/data/e7648127-0779-4019-a6c2-377fd9d4f1e2.lance b/.lancedb/nltk_chunking.lance/data/e7648127-0779-4019-a6c2-377fd9d4f1e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..ec6fdf75eb645d11f205d2d410d599bc47fca5b4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e7648127-0779-4019-a6c2-377fd9d4f1e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f61c72f4459b0d9d64e421fa385e0ab17b41f2aa24feadbaddae4828a8de14 +size 54398 diff --git a/.lancedb/nltk_chunking.lance/data/e826b1bb-8b78-439b-8087-f0a7b0898975.lance b/.lancedb/nltk_chunking.lance/data/e826b1bb-8b78-439b-8087-f0a7b0898975.lance new file mode 100644 index 0000000000000000000000000000000000000000..52ae825de9ce15e270c51645f4b71b4c05d0e58a --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e826b1bb-8b78-439b-8087-f0a7b0898975.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9ccd437831ec4d20fe45d60338534da9a04d1cbf1ade748523744f1c8af2dec +size 57113 diff --git a/.lancedb/nltk_chunking.lance/data/e9b2004a-d767-4e4d-b1b3-3e5ef2a3fa82.lance b/.lancedb/nltk_chunking.lance/data/e9b2004a-d767-4e4d-b1b3-3e5ef2a3fa82.lance new file mode 100644 index 0000000000000000000000000000000000000000..924d75c1716797c08d80f9742136e00a4825ae38 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e9b2004a-d767-4e4d-b1b3-3e5ef2a3fa82.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aa67fcc0b616f8e2b5524d5a839ccaf7f4a34ad432c0d1a1d13be688cde981c +size 58879 diff --git a/.lancedb/nltk_chunking.lance/data/e9bfb81a-d016-4025-ab39-b479a86f42ce.lance b/.lancedb/nltk_chunking.lance/data/e9bfb81a-d016-4025-ab39-b479a86f42ce.lance new file mode 100644 index 0000000000000000000000000000000000000000..652a62cc0be76a50ba78cb7771b34d1b9db3f862 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e9bfb81a-d016-4025-ab39-b479a86f42ce.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e3551d3f2e826251b51c3c574d2f2a250c5a4696de88ed63a821cf78112d04 +size 55171 diff --git a/.lancedb/nltk_chunking.lance/data/e9ec36f2-dd01-447a-8b60-063c5a9366b0.lance 
b/.lancedb/nltk_chunking.lance/data/e9ec36f2-dd01-447a-8b60-063c5a9366b0.lance new file mode 100644 index 0000000000000000000000000000000000000000..28d3014424d4be7da9a49d592d0d861e04e87ee0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/e9ec36f2-dd01-447a-8b60-063c5a9366b0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f91a9f0755a921826aa9b689c520e7e01082ae4a9febd469c438edf1c99a9342 +size 58909 diff --git a/.lancedb/nltk_chunking.lance/data/ea023a60-c173-4281-bc9e-1f965147ceb1.lance b/.lancedb/nltk_chunking.lance/data/ea023a60-c173-4281-bc9e-1f965147ceb1.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e327fafaf9625a47d5790c3ee90e7080132045f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ea023a60-c173-4281-bc9e-1f965147ceb1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cefb3d74a94d07d2b746c017bdfe2206495589d176fc1e2c85200fcc4f22ff52 +size 54323 diff --git a/.lancedb/nltk_chunking.lance/data/ea37b0c1-2822-4b9e-9ff6-55d3967deb72.lance b/.lancedb/nltk_chunking.lance/data/ea37b0c1-2822-4b9e-9ff6-55d3967deb72.lance new file mode 100644 index 0000000000000000000000000000000000000000..749af8511322fd8f55c4fe9e84619b4d48da4446 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ea37b0c1-2822-4b9e-9ff6-55d3967deb72.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9df90e420debe59a1ca33e7a5cdf6bb106d87f4f9a6955dec51fc1848ba4f24 +size 54084 diff --git a/.lancedb/nltk_chunking.lance/data/eb22ff45-0267-4b88-96cd-0d97201cf98b.lance b/.lancedb/nltk_chunking.lance/data/eb22ff45-0267-4b88-96cd-0d97201cf98b.lance new file mode 100644 index 0000000000000000000000000000000000000000..2a619c529c56dfc4c3d0f1f79f66ea497fa9d941 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/eb22ff45-0267-4b88-96cd-0d97201cf98b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5188755947481fc6a06aa680e829c60a99cbbe059a640c4a9b0778fba8e7b460 +size 55992 diff --git a/.lancedb/nltk_chunking.lance/data/eba4e892-2f18-43e3-bce2-9a03c7a3d6d7.lance b/.lancedb/nltk_chunking.lance/data/eba4e892-2f18-43e3-bce2-9a03c7a3d6d7.lance new file mode 100644 index 0000000000000000000000000000000000000000..870fe007dce01fde6115354525707acc32486b00 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/eba4e892-2f18-43e3-bce2-9a03c7a3d6d7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2830cb408721093cc414bf62207366e50bac6fdd9a95a9819d57a0fb0905746c +size 54727 diff --git a/.lancedb/nltk_chunking.lance/data/eba90ef0-0715-4c3d-b45a-c38ba4986701.lance b/.lancedb/nltk_chunking.lance/data/eba90ef0-0715-4c3d-b45a-c38ba4986701.lance new file mode 100644 index 0000000000000000000000000000000000000000..6b050330f756fc33f7b700798c6c0e08f507238f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/eba90ef0-0715-4c3d-b45a-c38ba4986701.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0eb6f8adc17754b819207e9852ae597051033f62cdc50917b3d300afeefc206 +size 58551 diff --git a/.lancedb/nltk_chunking.lance/data/ec028db0-c868-40b1-83f2-91e178b2d023.lance b/.lancedb/nltk_chunking.lance/data/ec028db0-c868-40b1-83f2-91e178b2d023.lance new file mode 100644 index 0000000000000000000000000000000000000000..c7b265339320bd5638c476256f24482c2aeeb67f --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ec028db0-c868-40b1-83f2-91e178b2d023.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:408e840958df6590ec8f6144fb39ccaa5f22f095ca193d8f818fd6c27e904a1c +size 56573 diff --git a/.lancedb/nltk_chunking.lance/data/edd1f2c8-f413-443a-a191-5eea8506e1bd.lance b/.lancedb/nltk_chunking.lance/data/edd1f2c8-f413-443a-a191-5eea8506e1bd.lance new file mode 100644 index 0000000000000000000000000000000000000000..f41f2d0038a610ded99ca94d4b397fc285c8ad86 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/edd1f2c8-f413-443a-a191-5eea8506e1bd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08fca51a15a521e308053e89524bf331293c0d4494c4941bce3528db254cf191 +size 54602 diff --git a/.lancedb/nltk_chunking.lance/data/ee0a50fd-85e5-4729-b1fb-5e9bc11231a4.lance b/.lancedb/nltk_chunking.lance/data/ee0a50fd-85e5-4729-b1fb-5e9bc11231a4.lance new file mode 100644 index 0000000000000000000000000000000000000000..5831ae5a1539ecaf6f6875f8faabfce327f6fed4 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ee0a50fd-85e5-4729-b1fb-5e9bc11231a4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34565003c58f5985c1b3cea66133989589e09039a3e59c84bd40e132ffcf2c0f +size 54388 diff --git a/.lancedb/nltk_chunking.lance/data/ee4bdacd-0c16-41cd-93f0-976b25daaa87.lance b/.lancedb/nltk_chunking.lance/data/ee4bdacd-0c16-41cd-93f0-976b25daaa87.lance new file mode 100644 index 0000000000000000000000000000000000000000..b4abb5084697498d7ce8002267ad562bc6dd37c5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ee4bdacd-0c16-41cd-93f0-976b25daaa87.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:507ebf64e3900f34d081506b7a7f042c2382153fb127a123e589785ab24506d1 +size 55497 diff --git a/.lancedb/nltk_chunking.lance/data/ee54e5dc-a6a4-4f18-90b2-162053b9a2e1.lance b/.lancedb/nltk_chunking.lance/data/ee54e5dc-a6a4-4f18-90b2-162053b9a2e1.lance new file mode 100644 index 0000000000000000000000000000000000000000..c4d2458ee39b585d474a33d19ce330bb997e2ce7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ee54e5dc-a6a4-4f18-90b2-162053b9a2e1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa69269ee2b46d707e152afc2c386e95fe0cd7f5758394c09e2b62d6572747b3 +size 55096 diff --git a/.lancedb/nltk_chunking.lance/data/f1c799b3-bd8e-4744-960b-dab0358d5c52.lance b/.lancedb/nltk_chunking.lance/data/f1c799b3-bd8e-4744-960b-dab0358d5c52.lance new file mode 100644 index 0000000000000000000000000000000000000000..21c43a3f72d83abaf3dc8c3a3218a0a2c4bd7ea9 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f1c799b3-bd8e-4744-960b-dab0358d5c52.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679ac9fcfae5114026fd74d6a81df65e2351eec4c33f11937ed3be8e20801160 +size 55629 diff --git a/.lancedb/nltk_chunking.lance/data/f1e59874-e100-41b3-8f67-c495ff357357.lance b/.lancedb/nltk_chunking.lance/data/f1e59874-e100-41b3-8f67-c495ff357357.lance new file mode 100644 index 0000000000000000000000000000000000000000..e106bbdcf613b525eef1592eb8f23336aa689355 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f1e59874-e100-41b3-8f67-c495ff357357.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:836475082d8d17325bf2c7e682c8e315e75174a6311a8fe7b572612157dc8e27 +size 56217 diff --git a/.lancedb/nltk_chunking.lance/data/f426dbc8-d3fb-4d29-82b0-c4abe26b6bb9.lance b/.lancedb/nltk_chunking.lance/data/f426dbc8-d3fb-4d29-82b0-c4abe26b6bb9.lance new file mode 100644 index 0000000000000000000000000000000000000000..c79bc66112798a890bf47e175b70672e1b0ff3e8 --- /dev/null +++ 
b/.lancedb/nltk_chunking.lance/data/f426dbc8-d3fb-4d29-82b0-c4abe26b6bb9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3a3bc02e4ec62b81311410a64277752ec8bbe382107d7de93eea3d702f97cd +size 54568 diff --git a/.lancedb/nltk_chunking.lance/data/f574513f-4754-4540-9967-13660a890b09.lance b/.lancedb/nltk_chunking.lance/data/f574513f-4754-4540-9967-13660a890b09.lance new file mode 100644 index 0000000000000000000000000000000000000000..3d53a1b0e903b3dd3913a6ac42bf0d23ed098edd --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f574513f-4754-4540-9967-13660a890b09.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5cfaff47479e8cd4187e2496eb4e7ccbaf1728ad936ca751c5112e52026a3570 +size 53988 diff --git a/.lancedb/nltk_chunking.lance/data/f71f1622-ab2d-4294-bb64-54c583b7f2eb.lance b/.lancedb/nltk_chunking.lance/data/f71f1622-ab2d-4294-bb64-54c583b7f2eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..bf5f6a226553d2891d5654b8fe2fd7df103b84c7 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f71f1622-ab2d-4294-bb64-54c583b7f2eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47766d739a78b56401f73dd9a32a31e5ee40db48b866be496a6a2d870c1be10d +size 55881 diff --git a/.lancedb/nltk_chunking.lance/data/f8f5905d-f772-4684-bef5-0987e6c57236.lance b/.lancedb/nltk_chunking.lance/data/f8f5905d-f772-4684-bef5-0987e6c57236.lance new file mode 100644 index 0000000000000000000000000000000000000000..6aad989001a0695dc9caa06a4fa5c7674d816931 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f8f5905d-f772-4684-bef5-0987e6c57236.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28973a0c6fd541bdf850f8890f9d9f58bdc22c79fcba9bd38578f13348870ed3 +size 54764 diff --git a/.lancedb/nltk_chunking.lance/data/f8fbbccf-ac5a-4300-9860-8500248aabe1.lance b/.lancedb/nltk_chunking.lance/data/f8fbbccf-ac5a-4300-9860-8500248aabe1.lance new file mode 100644 index 0000000000000000000000000000000000000000..dfd419acd1daddef8e8f2a77ca6fe1f4d8474fa5 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f8fbbccf-ac5a-4300-9860-8500248aabe1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec4a53848e1ed111e99aa8991eb329edcf1fe1c2b09ec6e2381c73719ff91a6e +size 55908 diff --git a/.lancedb/nltk_chunking.lance/data/f90be475-7794-47c4-b255-559108734717.lance b/.lancedb/nltk_chunking.lance/data/f90be475-7794-47c4-b255-559108734717.lance new file mode 100644 index 0000000000000000000000000000000000000000..83d2399cb76d602890a82f1d75d5be6dfef87408 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f90be475-7794-47c4-b255-559108734717.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430828e099ae89db4c5bd74acf39e857e8ccbfd1280ba6aeb89868c83f428bbe +size 55841 diff --git a/.lancedb/nltk_chunking.lance/data/f93c3a59-6ed6-4317-8dc3-4b992bec22dd.lance b/.lancedb/nltk_chunking.lance/data/f93c3a59-6ed6-4317-8dc3-4b992bec22dd.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d2ce870252ad6e9dee59a165607531603021e00 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f93c3a59-6ed6-4317-8dc3-4b992bec22dd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:548cb5deff282f617739df109c0fc5a3cbf91f00ee4c15458788aa6af24e9593 +size 55831 diff --git a/.lancedb/nltk_chunking.lance/data/f9519d1a-3efa-4a97-8d70-024afaa37382.lance b/.lancedb/nltk_chunking.lance/data/f9519d1a-3efa-4a97-8d70-024afaa37382.lance new 
file mode 100644 index 0000000000000000000000000000000000000000..adde972ae2474227c56dcc91638f4dce5a3dfbee --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/f9519d1a-3efa-4a97-8d70-024afaa37382.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a03306679becbab3943f498d56171dd24afb3df87dd43c0d2f44d82c6ce1d0c +size 54355 diff --git a/.lancedb/nltk_chunking.lance/data/fa6af94d-7375-4e92-b69b-42cf6fd02edb.lance b/.lancedb/nltk_chunking.lance/data/fa6af94d-7375-4e92-b69b-42cf6fd02edb.lance new file mode 100644 index 0000000000000000000000000000000000000000..21a180e7a80733cdf3f187787d671526bbce5c09 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fa6af94d-7375-4e92-b69b-42cf6fd02edb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2297a1c084447749da15e6eba6212fc731c4b92646d84531fc40f95a4eb9b27c +size 55653 diff --git a/.lancedb/nltk_chunking.lance/data/fabdbd83-0062-4e4c-a537-c3865fd89c4a.lance b/.lancedb/nltk_chunking.lance/data/fabdbd83-0062-4e4c-a537-c3865fd89c4a.lance new file mode 100644 index 0000000000000000000000000000000000000000..74e97f975df8ec056f63c117b9f89282defdf5c6 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fabdbd83-0062-4e4c-a537-c3865fd89c4a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99402fa1b0e22bacf0ec2f0f853002d02d440579b7dfec4b1bf6cdac6c347678 +size 54595 diff --git a/.lancedb/nltk_chunking.lance/data/fae1126d-370d-4463-a9d9-c51a5408cfe9.lance b/.lancedb/nltk_chunking.lance/data/fae1126d-370d-4463-a9d9-c51a5408cfe9.lance new file mode 100644 index 0000000000000000000000000000000000000000..74d42d6c25316fc9fc12d544cfe6b2e3e47280b0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fae1126d-370d-4463-a9d9-c51a5408cfe9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dad3cf376b4c1cec2217c8228632ba3fce3081cd73a7e05c4dee8de0cdba07b7 +size 54613 diff --git a/.lancedb/nltk_chunking.lance/data/fb4cffc9-8f01-4530-aab1-18998100882b.lance b/.lancedb/nltk_chunking.lance/data/fb4cffc9-8f01-4530-aab1-18998100882b.lance new file mode 100644 index 0000000000000000000000000000000000000000..5fc0e5c0a6f545f8b1d9437b18eb5baac7aefa09 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fb4cffc9-8f01-4530-aab1-18998100882b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79227a14aa657f06dbe15a455d757ccda347b0529798f7ed546096e70c0a7805 +size 55033 diff --git a/.lancedb/nltk_chunking.lance/data/fb557ad1-b162-4005-a544-bee09e7fe74c.lance b/.lancedb/nltk_chunking.lance/data/fb557ad1-b162-4005-a544-bee09e7fe74c.lance new file mode 100644 index 0000000000000000000000000000000000000000..56a8ace6679a088f2dd9559e64b269f4827dadf0 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fb557ad1-b162-4005-a544-bee09e7fe74c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb9bd1ee68fdc2a9b68a78946ebfbf65ccb3ab41ed75b630711d52b6448c09d2 +size 57121 diff --git a/.lancedb/nltk_chunking.lance/data/fc099161-8da5-44b3-8ce2-5ca2b892b40d.lance b/.lancedb/nltk_chunking.lance/data/fc099161-8da5-44b3-8ce2-5ca2b892b40d.lance new file mode 100644 index 0000000000000000000000000000000000000000..92cdcc5de6397eb886e4bd1fc13cb4076f07d328 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fc099161-8da5-44b3-8ce2-5ca2b892b40d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aac682562d312443880826fb27cc1f4ab1a50354af400cf942ba81f9e7ffcbad +size 55859 diff --git 
a/.lancedb/nltk_chunking.lance/data/fe49dfc3-fd75-49a6-a8b3-c47b95a7f6f2.lance b/.lancedb/nltk_chunking.lance/data/fe49dfc3-fd75-49a6-a8b3-c47b95a7f6f2.lance new file mode 100644 index 0000000000000000000000000000000000000000..a99036898d800b95d2c69313094fadaafe869889 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fe49dfc3-fd75-49a6-a8b3-c47b95a7f6f2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c4a99c9047bf6e2bae402d1193079028ef482f75c5828a7181800ce58db9f8c +size 57885 diff --git a/.lancedb/nltk_chunking.lance/data/fea4dbf9-8be4-4c01-9f29-a46d0456cb73.lance b/.lancedb/nltk_chunking.lance/data/fea4dbf9-8be4-4c01-9f29-a46d0456cb73.lance new file mode 100644 index 0000000000000000000000000000000000000000..448ea183f865f456df7317e0e930291c26b4f07d --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/fea4dbf9-8be4-4c01-9f29-a46d0456cb73.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00cbd7217f9455f69f2ca623e34a8bf6b3a6d16ee9719f00868431181114bfa9 +size 54450 diff --git a/.lancedb/nltk_chunking.lance/data/ffa1a28e-7ad0-48d1-be45-52666049129c.lance b/.lancedb/nltk_chunking.lance/data/ffa1a28e-7ad0-48d1-be45-52666049129c.lance new file mode 100644 index 0000000000000000000000000000000000000000..65a4f4d701147d2c556df1abfcfde14880107b70 --- /dev/null +++ b/.lancedb/nltk_chunking.lance/data/ffa1a28e-7ad0-48d1-be45-52666049129c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e40ed15eb3b5410b5732e0cb9f03d02a7b7126f810caa4b6f1affd6e98e99fa6 +size 57728 diff --git a/.lancedb/nltk_chunking_BAAI.lance/_indices/3cc071ef-176c-452c-ad13-03bcdb43df51/index.idx b/.lancedb/nltk_chunking_BAAI.lance/_indices/3cc071ef-176c-452c-ad13-03bcdb43df51/index.idx new file mode 100644 index 0000000000000000000000000000000000000000..9645fa880f07aadee2883e9aa98df64c1d9897c1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/_indices/3cc071ef-176c-452c-ad13-03bcdb43df51/index.idx @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e00dc8952995c48b941bbc7e0aa518ae0a24258d85f95120b421ab32ef46449e +size 2399698 diff --git a/.lancedb/nltk_chunking_BAAI.lance/_latest.manifest b/.lancedb/nltk_chunking_BAAI.lance/_latest.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2a4e5ebd77b47a336ee2214c2d3daefb03aee059 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_latest.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/0-ccf1cfd1-020a-480e-a5ee-ba25978cc2fb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/0-ccf1cfd1-020a-480e-a5ee-ba25978cc2fb.txn new file mode 100644 index 0000000000000000000000000000000000000000..a360e197b598e44f926223d4559e7057e6d11eef --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/_transactions/0-ccf1cfd1-020a-480e-a5ee-ba25978cc2fb.txn @@ -0,0 +1 @@ +$ccf1cfd1-020a-480e-a5ee-ba25978cc2fb²V3vector ÿÿÿÿÿÿÿÿÿ*fixed_size_list:float:102408text ÿÿÿÿÿÿÿÿÿ*string08 \ No newline at end of file diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/1-0498be63-2f12-4937-addb-099e024d2a54.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/1-0498be63-2f12-4937-addb-099e024d2a54.txn new file mode 100644 index 0000000000000000000000000000000000000000..435a0fa5dd48920061989339d9fc69e885b1359f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/1-0498be63-2f12-4937-addb-099e024d2a54.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/10-7cc468d9-dfe6-4fa8-a203-cf4b5287efe4.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/10-7cc468d9-dfe6-4fa8-a203-cf4b5287efe4.txn new file mode 100644 index 0000000000000000000000000000000000000000..5f6f721a49a05feed9ea3de2b35445c90153ef9d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/10-7cc468d9-dfe6-4fa8-a203-cf4b5287efe4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/100-d0002197-4b61-46ae-b961-50d1c6e65463.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/100-d0002197-4b61-46ae-b961-50d1c6e65463.txn new file mode 100644 index 0000000000000000000000000000000000000000..a28d5179b1e541c3cdb4d67fcd47cda0a3c50c42 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/100-d0002197-4b61-46ae-b961-50d1c6e65463.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/101-90578af6-dde7-4ee1-afb4-f414f21fd4f1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/101-90578af6-dde7-4ee1-afb4-f414f21fd4f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..09c947d4d89fe7871b6f3662a595d76800118cbb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/101-90578af6-dde7-4ee1-afb4-f414f21fd4f1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/102-6dc1e7e1-e05b-4263-8585-daa7a6f46635.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/102-6dc1e7e1-e05b-4263-8585-daa7a6f46635.txn new file mode 100644 index 0000000000000000000000000000000000000000..9843e8013395d75c529c727a7b5d823107df5537 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/102-6dc1e7e1-e05b-4263-8585-daa7a6f46635.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/103-4adf2ade-ebba-4f5e-a88a-9d8b6bc494fe.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/103-4adf2ade-ebba-4f5e-a88a-9d8b6bc494fe.txn new file mode 100644 index 0000000000000000000000000000000000000000..07362a2a2c11b00419e5f7f08c4ffb45cf7a284b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/103-4adf2ade-ebba-4f5e-a88a-9d8b6bc494fe.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/104-4e746412-3f29-42fd-b0a7-3303a0d413c6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/104-4e746412-3f29-42fd-b0a7-3303a0d413c6.txn new file mode 100644 index 0000000000000000000000000000000000000000..9437ff4d474f7dbe2e7af64b4770adeb14ba574b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/104-4e746412-3f29-42fd-b0a7-3303a0d413c6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/105-6f85b623-75a4-4ba3-92f7-c488b78dd397.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/105-6f85b623-75a4-4ba3-92f7-c488b78dd397.txn new file mode 100644 index 0000000000000000000000000000000000000000..1ad1ee24011f19951d25bc7251492bfd485ac024 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/105-6f85b623-75a4-4ba3-92f7-c488b78dd397.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/106-18ea0f0b-4030-448c-af66-411b2e0c0852.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/106-18ea0f0b-4030-448c-af66-411b2e0c0852.txn new file mode 100644 index 0000000000000000000000000000000000000000..8acc4ed6591900bbb3bc3f5b88e48d924018c28b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/106-18ea0f0b-4030-448c-af66-411b2e0c0852.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/107-45a4079d-6ab4-4247-8a1b-7ec38443605c.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/107-45a4079d-6ab4-4247-8a1b-7ec38443605c.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0e52640ba8b417e0ea92cef458085666b1a802b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/107-45a4079d-6ab4-4247-8a1b-7ec38443605c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/108-e6d2a177-01e6-482c-a6df-9a0b7127bb9a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/108-e6d2a177-01e6-482c-a6df-9a0b7127bb9a.txn new file mode 100644 index 0000000000000000000000000000000000000000..50e59cce56953aa952d69700a97c3fbb74121f41 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/108-e6d2a177-01e6-482c-a6df-9a0b7127bb9a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/109-6788a67d-fa80-413e-a278-d0c67710338e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/109-6788a67d-fa80-413e-a278-d0c67710338e.txn new file mode 100644 index 0000000000000000000000000000000000000000..fa1fca4e419285f3e06ee0c1788213e919ea607e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/109-6788a67d-fa80-413e-a278-d0c67710338e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/11-99037161-14cf-4119-8599-bd0954a8efb1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/11-99037161-14cf-4119-8599-bd0954a8efb1.txn new file mode 100644 index 0000000000000000000000000000000000000000..0a93dc1e36c2059ed27b6e1c085fd63c28c128aa Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/11-99037161-14cf-4119-8599-bd0954a8efb1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/110-d47f7e50-c127-4f64-a85c-86510730187c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/110-d47f7e50-c127-4f64-a85c-86510730187c.txn new file mode 100644 index 0000000000000000000000000000000000000000..fa528440388fc4313251675cb5acd0beff1c9767 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/110-d47f7e50-c127-4f64-a85c-86510730187c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/111-18ba30fa-aa8d-4282-84e5-ddec43afefb3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/111-18ba30fa-aa8d-4282-84e5-ddec43afefb3.txn new file mode 100644 index 0000000000000000000000000000000000000000..df28da42c8b96f9d6641b7f05c0d0fb6bdc31cb1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/111-18ba30fa-aa8d-4282-84e5-ddec43afefb3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/112-12cdaf30-8777-4d87-b231-fc9645be791c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/112-12cdaf30-8777-4d87-b231-fc9645be791c.txn new file mode 100644 index 0000000000000000000000000000000000000000..12abe5da4acbde986daff09a31ad52db90287638 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/112-12cdaf30-8777-4d87-b231-fc9645be791c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/113-29600927-3662-49ae-a360-9b915fe3076c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/113-29600927-3662-49ae-a360-9b915fe3076c.txn new file mode 100644 index 0000000000000000000000000000000000000000..a67fdca86bc13d6a2aca67bca8944c90425054bd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/113-29600927-3662-49ae-a360-9b915fe3076c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/114-9c0e3716-0eee-42a1-a201-0998c084f958.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/114-9c0e3716-0eee-42a1-a201-0998c084f958.txn new file mode 100644 index 0000000000000000000000000000000000000000..de301b8384c0dfd836336e1d9a84b60c527798fe Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/114-9c0e3716-0eee-42a1-a201-0998c084f958.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/115-e175dba1-db19-46f2-9307-a1ea2680596b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/115-e175dba1-db19-46f2-9307-a1ea2680596b.txn new file mode 100644 index 0000000000000000000000000000000000000000..4eb1070dae13310656626808ebe403ffbc8f33ff Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/115-e175dba1-db19-46f2-9307-a1ea2680596b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/116-63c657fa-d69c-4ad3-a5e4-d1fd39ec5d8f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/116-63c657fa-d69c-4ad3-a5e4-d1fd39ec5d8f.txn new file mode 100644 index 0000000000000000000000000000000000000000..ec13e53ec5848ebc335f6d222ffd7a77b61ec425 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/116-63c657fa-d69c-4ad3-a5e4-d1fd39ec5d8f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/117-96140760-6d01-4ed8-a142-b7a6eef01d2e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/117-96140760-6d01-4ed8-a142-b7a6eef01d2e.txn new file mode 100644 index 0000000000000000000000000000000000000000..5e93331e597e3773f355b6fc737a8ac407d42950 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/117-96140760-6d01-4ed8-a142-b7a6eef01d2e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/118-7982331b-b2dd-4735-bad1-4303532389cf.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/118-7982331b-b2dd-4735-bad1-4303532389cf.txn new file mode 100644 index 0000000000000000000000000000000000000000..483a507b8887635bf1807e18a7a266263efb539a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/118-7982331b-b2dd-4735-bad1-4303532389cf.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/119-9efd69df-7f10-4ab7-b425-43238aa45649.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/119-9efd69df-7f10-4ab7-b425-43238aa45649.txn new file mode 100644 index 0000000000000000000000000000000000000000..0dc3812ffe663a04e8b70ab1bd6d8fd4a39a2fcc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/119-9efd69df-7f10-4ab7-b425-43238aa45649.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/12-34587498-082b-4cfe-9c5b-92017b6c0c6e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/12-34587498-082b-4cfe-9c5b-92017b6c0c6e.txn new file mode 100644 index 0000000000000000000000000000000000000000..e18339bf8fcf54890a8eb92bd0ab856d7930cbb6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/12-34587498-082b-4cfe-9c5b-92017b6c0c6e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/120-b3952bef-953f-4828-b60a-a1d8fd2052ff.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/120-b3952bef-953f-4828-b60a-a1d8fd2052ff.txn new file mode 100644 index 0000000000000000000000000000000000000000..e74795e30e30a4ac012f131d0a83cd3e7982670e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/120-b3952bef-953f-4828-b60a-a1d8fd2052ff.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/121-6d6c6ce4-06c9-4c6f-8f7e-94af7b5ed32f.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/121-6d6c6ce4-06c9-4c6f-8f7e-94af7b5ed32f.txn new file mode 100644 index 0000000000000000000000000000000000000000..7df4fb06fbab37cf2948b0737c7ee48f067e0b4c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/121-6d6c6ce4-06c9-4c6f-8f7e-94af7b5ed32f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/122-d7dc2622-d438-4c54-9530-c776f9111112.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/122-d7dc2622-d438-4c54-9530-c776f9111112.txn new file mode 100644 index 0000000000000000000000000000000000000000..7f2b7e006eb7f69d942048a153254c0b91af5a09 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/122-d7dc2622-d438-4c54-9530-c776f9111112.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/123-ebd5ec1b-1cca-4a1a-a7e0-7daaf531ef32.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/123-ebd5ec1b-1cca-4a1a-a7e0-7daaf531ef32.txn new file mode 100644 index 0000000000000000000000000000000000000000..8681e5d24756e3a7ee76aaea366114fdd5274b77 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/123-ebd5ec1b-1cca-4a1a-a7e0-7daaf531ef32.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/124-1e51934e-6334-4695-b1b5-230b42764309.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/124-1e51934e-6334-4695-b1b5-230b42764309.txn new file mode 100644 index 0000000000000000000000000000000000000000..c221f08f8d09340692b8a4529a82c32872ff86e6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/124-1e51934e-6334-4695-b1b5-230b42764309.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/125-c7ead199-0ef4-4748-b389-c5883e84e40e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/125-c7ead199-0ef4-4748-b389-c5883e84e40e.txn new file mode 100644 index 0000000000000000000000000000000000000000..349bb8e469c57234388f807dc12a2f2c126f7f96 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/125-c7ead199-0ef4-4748-b389-c5883e84e40e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/126-de55a603-286d-461d-986d-875e55e712f3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/126-de55a603-286d-461d-986d-875e55e712f3.txn new file mode 100644 index 0000000000000000000000000000000000000000..1be234331948dff6efc8a42fec0838710e0d4ee4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/126-de55a603-286d-461d-986d-875e55e712f3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/127-8075fdcb-3cf4-40b0-b9c6-1465c2acc12c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/127-8075fdcb-3cf4-40b0-b9c6-1465c2acc12c.txn new file mode 100644 index 0000000000000000000000000000000000000000..697c87ed2ee443e7e58ae847b4c992cf37f5ac1c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/127-8075fdcb-3cf4-40b0-b9c6-1465c2acc12c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/128-0500ee7b-6360-4ce4-a877-58cb38d9e816.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/128-0500ee7b-6360-4ce4-a877-58cb38d9e816.txn new file mode 100644 index 0000000000000000000000000000000000000000..b9a34ff56691a885c144b927b68aafc9a34303f8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/128-0500ee7b-6360-4ce4-a877-58cb38d9e816.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/129-69f23870-dbad-4be6-b45d-b15ab69c7e88.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/129-69f23870-dbad-4be6-b45d-b15ab69c7e88.txn new file mode 100644 index 0000000000000000000000000000000000000000..e680bbf9f7e175538ae6700756265ae1877b1457 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/129-69f23870-dbad-4be6-b45d-b15ab69c7e88.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/13-0a21f58d-c4ad-478f-99bd-302109a5251d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/13-0a21f58d-c4ad-478f-99bd-302109a5251d.txn new file mode 100644 index 0000000000000000000000000000000000000000..e3e7762da0e7ec791fa7cd610969da3404aa5522 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/13-0a21f58d-c4ad-478f-99bd-302109a5251d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/130-6b7daef3-1811-4b71-96d3-83ee153ed833.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/130-6b7daef3-1811-4b71-96d3-83ee153ed833.txn new file mode 100644 index 0000000000000000000000000000000000000000..da974484deae6d1aea0a454f13070f1f396bdb4c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/130-6b7daef3-1811-4b71-96d3-83ee153ed833.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/131-15ea119b-165b-4dd4-8ce3-93f29e4f55d8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/131-15ea119b-165b-4dd4-8ce3-93f29e4f55d8.txn new file mode 100644 index 0000000000000000000000000000000000000000..eafe33da000090514d608c3c76b8ff19a5cc7c9f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/131-15ea119b-165b-4dd4-8ce3-93f29e4f55d8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/132-30ead30f-6c46-41e8-9fdf-24bbf9e7c230.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/132-30ead30f-6c46-41e8-9fdf-24bbf9e7c230.txn new file mode 100644 index 0000000000000000000000000000000000000000..5fcf770a222ff21f2657c642107574729c85bfa4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/132-30ead30f-6c46-41e8-9fdf-24bbf9e7c230.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/133-64484374-a091-4955-8359-a3f04a9b0d57.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/133-64484374-a091-4955-8359-a3f04a9b0d57.txn new file mode 100644 index 0000000000000000000000000000000000000000..579fdaf586867f6a09367bd106f2ac794c9219cc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/133-64484374-a091-4955-8359-a3f04a9b0d57.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/134-0f0c6531-ec28-4076-b927-2f0692faf685.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/134-0f0c6531-ec28-4076-b927-2f0692faf685.txn new file mode 100644 index 0000000000000000000000000000000000000000..a45e96d8326e9b03294a790fedfb8e2eada4d1d0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/134-0f0c6531-ec28-4076-b927-2f0692faf685.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/135-2e8fc695-65b9-4bdd-8dbc-97abe56a36ac.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/135-2e8fc695-65b9-4bdd-8dbc-97abe56a36ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..194c0bb8f2be445e4b26ec17d64fdf294aac9548 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/135-2e8fc695-65b9-4bdd-8dbc-97abe56a36ac.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/136-6e9da7d0-fdf3-4167-89e8-c269e67becef.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/136-6e9da7d0-fdf3-4167-89e8-c269e67becef.txn new file mode 100644 index 0000000000000000000000000000000000000000..9b0bc6a290ea155aa9cb3e424fa0a9a4c8d1415f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/136-6e9da7d0-fdf3-4167-89e8-c269e67becef.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/137-d398e849-ac95-4fb5-a6d6-617ead188888.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/137-d398e849-ac95-4fb5-a6d6-617ead188888.txn new file mode 100644 index 0000000000000000000000000000000000000000..82b1d1bef963c7def7ba6d48d079b068f16afdab Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/137-d398e849-ac95-4fb5-a6d6-617ead188888.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/138-7686faf5-bdf7-436b-996c-9a807bcebc70.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/138-7686faf5-bdf7-436b-996c-9a807bcebc70.txn new file mode 100644 index 0000000000000000000000000000000000000000..ca3211aea314583042b9b38c704384ac6636adcd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/138-7686faf5-bdf7-436b-996c-9a807bcebc70.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/139-61408008-314d-48ef-9a13-785bc2024304.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/139-61408008-314d-48ef-9a13-785bc2024304.txn new file mode 100644 index 0000000000000000000000000000000000000000..b3bce1ba040442914649854bdbf3e9508e7ff688 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/139-61408008-314d-48ef-9a13-785bc2024304.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/14-e22367dd-925c-469c-b406-88d5382d8392.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/14-e22367dd-925c-469c-b406-88d5382d8392.txn new file mode 100644 index 0000000000000000000000000000000000000000..d4e0f156c9d8ba2b922a173422b2a9dc11eb2c73 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/14-e22367dd-925c-469c-b406-88d5382d8392.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/140-dc940332-b75c-4e14-b89a-de183931aa98.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/140-dc940332-b75c-4e14-b89a-de183931aa98.txn new file mode 100644 index 0000000000000000000000000000000000000000..829812bca6caa08a4cec2c8cfd59eb19c610c7cb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/140-dc940332-b75c-4e14-b89a-de183931aa98.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/141-38a09f94-8d75-498b-8b55-339488c3d2c3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/141-38a09f94-8d75-498b-8b55-339488c3d2c3.txn new file mode 100644 index 0000000000000000000000000000000000000000..432adcad9ac78aed64d4523acf8e7d01159e8d53 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/141-38a09f94-8d75-498b-8b55-339488c3d2c3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/142-08c0bc60-80f8-48b7-beac-1ab81aa0470a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/142-08c0bc60-80f8-48b7-beac-1ab81aa0470a.txn new file mode 100644 index 0000000000000000000000000000000000000000..e0695ae2de17a1454d39cff893ed3937da098811 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/142-08c0bc60-80f8-48b7-beac-1ab81aa0470a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/143-3b77e351-c9ca-468a-81ce-830ce98b8dab.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/143-3b77e351-c9ca-468a-81ce-830ce98b8dab.txn new file mode 100644 index 0000000000000000000000000000000000000000..b099b801aecb971f1ff07558c28f9c532ecce62e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/143-3b77e351-c9ca-468a-81ce-830ce98b8dab.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/144-188d7159-b111-45bb-8e90-94dcfa9f2ad3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/144-188d7159-b111-45bb-8e90-94dcfa9f2ad3.txn new file mode 100644 index 0000000000000000000000000000000000000000..26eec97380f9d4ce7c9dba302822a915eb15de34 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/144-188d7159-b111-45bb-8e90-94dcfa9f2ad3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/145-a6e84838-50e6-4e72-b89b-bceb3afa29e4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/145-a6e84838-50e6-4e72-b89b-bceb3afa29e4.txn new file mode 100644 index 0000000000000000000000000000000000000000..8bb34041965772f71e35a5c467ef7fd1cb31a016 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/145-a6e84838-50e6-4e72-b89b-bceb3afa29e4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/146-841714c9-88b9-4d47-9458-f34b0ee1e307.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/146-841714c9-88b9-4d47-9458-f34b0ee1e307.txn new file mode 100644 index 0000000000000000000000000000000000000000..d9431d50effb5e36f6f67ad479c4b7ec40255d78 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/146-841714c9-88b9-4d47-9458-f34b0ee1e307.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/147-dc8c6c8c-e3ec-45d7-9d1d-f258bb805c03.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/147-dc8c6c8c-e3ec-45d7-9d1d-f258bb805c03.txn new file mode 100644 index 0000000000000000000000000000000000000000..39285695f22bb21f172e6c4253ab03e4f217014a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/147-dc8c6c8c-e3ec-45d7-9d1d-f258bb805c03.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/148-a9409129-e8a7-4b84-937b-d1742ac69841.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/148-a9409129-e8a7-4b84-937b-d1742ac69841.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7e13a8d6afc20085e32063caac3452af25524f7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/148-a9409129-e8a7-4b84-937b-d1742ac69841.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/149-cb669579-7cee-4615-8260-a7149d34bdc0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/149-cb669579-7cee-4615-8260-a7149d34bdc0.txn new file mode 100644 index 0000000000000000000000000000000000000000..3352b5ab5207a04de8b3a09e63177f6e1d3d81df Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/149-cb669579-7cee-4615-8260-a7149d34bdc0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/15-9cb6eecd-2014-4328-82e5-d2e886f6fd5e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/15-9cb6eecd-2014-4328-82e5-d2e886f6fd5e.txn new file mode 100644 index 0000000000000000000000000000000000000000..1b3ce98dcc993a7d5811f6d5f0fd3df1dcf229f5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/15-9cb6eecd-2014-4328-82e5-d2e886f6fd5e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/150-36dfc55c-468b-40e6-be69-d3314616f1ee.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/150-36dfc55c-468b-40e6-be69-d3314616f1ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..59d515c99d01fc9a19eff4df4eb6802c1b04dcdf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/150-36dfc55c-468b-40e6-be69-d3314616f1ee.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/151-d95da7ed-89b2-4fc8-83e5-e5d43b5fad63.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/151-d95da7ed-89b2-4fc8-83e5-e5d43b5fad63.txn new file mode 100644 index 0000000000000000000000000000000000000000..fb374c707125831eabd41514b2fc9dcbc600967d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/151-d95da7ed-89b2-4fc8-83e5-e5d43b5fad63.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/152-c5bac197-e131-4b08-a4df-e2ececed5bca.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/152-c5bac197-e131-4b08-a4df-e2ececed5bca.txn new file mode 100644 index 0000000000000000000000000000000000000000..950079bae2018dbc9ecfac66f40a8cbfcfb03740 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/152-c5bac197-e131-4b08-a4df-e2ececed5bca.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/153-3c23f211-2ca2-4e3b-864c-996d70ba181e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/153-3c23f211-2ca2-4e3b-864c-996d70ba181e.txn new file mode 100644 index 0000000000000000000000000000000000000000..910466d861e2f3a8d9a5b23ca7c41c66103a2965 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/153-3c23f211-2ca2-4e3b-864c-996d70ba181e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/154-e5fb7d98-9206-4a85-961c-856a3648d3e9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/154-e5fb7d98-9206-4a85-961c-856a3648d3e9.txn new file mode 100644 index 0000000000000000000000000000000000000000..e1bda44c4aa146fed835184c717a8a2c0555c3dc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/154-e5fb7d98-9206-4a85-961c-856a3648d3e9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/155-2bb7b122-d2b3-42c9-a2e3-59e23f0ad881.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/155-2bb7b122-d2b3-42c9-a2e3-59e23f0ad881.txn new file mode 100644 index 0000000000000000000000000000000000000000..2a5031d9b42a56c802ce6ad0e9ffa817b4ba79fb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/155-2bb7b122-d2b3-42c9-a2e3-59e23f0ad881.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/156-c8a2035e-bac3-4cf0-86be-8a9f4be7ea72.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/156-c8a2035e-bac3-4cf0-86be-8a9f4be7ea72.txn new file mode 100644 index 0000000000000000000000000000000000000000..399396a633666e7d8693b249c7cf3cea6d1c7e6d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/156-c8a2035e-bac3-4cf0-86be-8a9f4be7ea72.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/157-bc742045-d423-4855-aecd-492a20b257e0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/157-bc742045-d423-4855-aecd-492a20b257e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..a790d21df5be40abe6001708a685e524ac646f98 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/157-bc742045-d423-4855-aecd-492a20b257e0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/158-6a60fcc0-1a52-4609-a022-fe56f36c062a.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/158-6a60fcc0-1a52-4609-a022-fe56f36c062a.txn new file mode 100644 index 0000000000000000000000000000000000000000..f10486f9bcf81202a6be3f3c66f4c4b78c90ad08 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/158-6a60fcc0-1a52-4609-a022-fe56f36c062a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/159-aefbb3fc-b05d-41a6-aac9-ae05bcd04e04.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/159-aefbb3fc-b05d-41a6-aac9-ae05bcd04e04.txn new file mode 100644 index 0000000000000000000000000000000000000000..aa65d30d7587a37d7f2f6d92dd0a7821d50adbaa Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/159-aefbb3fc-b05d-41a6-aac9-ae05bcd04e04.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/16-1bafc0ee-22fd-4d47-838f-8feb31e671f2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/16-1bafc0ee-22fd-4d47-838f-8feb31e671f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..864a98fa4163a1734b1f7b9fe8130fc05a98ee81 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/16-1bafc0ee-22fd-4d47-838f-8feb31e671f2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/160-5135a77b-8e0e-4896-9768-ec4b7ee3015e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/160-5135a77b-8e0e-4896-9768-ec4b7ee3015e.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e66d540853a36820c796e80cf4dcc3915364171 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/160-5135a77b-8e0e-4896-9768-ec4b7ee3015e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/161-d9b9604b-34d6-4d76-94c1-99098b05d199.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/161-d9b9604b-34d6-4d76-94c1-99098b05d199.txn new file mode 100644 index 0000000000000000000000000000000000000000..c340a6904792dbd9f3c4cd1b150a6ffe21e7fd76 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/161-d9b9604b-34d6-4d76-94c1-99098b05d199.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/162-ac4edad2-5441-454c-8969-ea7ac9e9441a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/162-ac4edad2-5441-454c-8969-ea7ac9e9441a.txn new file mode 100644 index 0000000000000000000000000000000000000000..f858fdeb7fa99b97187d1fadc898d3a57a6ccaaa Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/162-ac4edad2-5441-454c-8969-ea7ac9e9441a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/163-d72a5d04-b8d9-41d7-a22f-88842fab77b4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/163-d72a5d04-b8d9-41d7-a22f-88842fab77b4.txn new file mode 100644 index 0000000000000000000000000000000000000000..ac996e829aa9ce2ad2c25eef696a33d7e9ef3282 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/163-d72a5d04-b8d9-41d7-a22f-88842fab77b4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/164-be891a99-05a1-48c7-948b-5cd54fd09158.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/164-be891a99-05a1-48c7-948b-5cd54fd09158.txn new file mode 100644 index 0000000000000000000000000000000000000000..81934f94d190d61f2d95da10bf87a8ddb7334466 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/164-be891a99-05a1-48c7-948b-5cd54fd09158.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/165-a8f16991-b4a1-4d3b-a441-2f7e96420326.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/165-a8f16991-b4a1-4d3b-a441-2f7e96420326.txn new file mode 100644 index 0000000000000000000000000000000000000000..8b3b78e3dc54ca911df1b1bbe5109cb86f9c0156 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/165-a8f16991-b4a1-4d3b-a441-2f7e96420326.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/166-f5856857-9100-48b0-9278-b49940f6a130.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/166-f5856857-9100-48b0-9278-b49940f6a130.txn new file mode 100644 index 0000000000000000000000000000000000000000..b8b62377e273f7822fffcf7c7db3114233caf19f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/166-f5856857-9100-48b0-9278-b49940f6a130.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/167-3bd928f9-66f7-442d-8177-ac4bbc557c64.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/167-3bd928f9-66f7-442d-8177-ac4bbc557c64.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a086c94a4022632245b226dfe1400e0919bf1a5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/167-3bd928f9-66f7-442d-8177-ac4bbc557c64.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/168-0be373e2-eddb-41a3-8e80-39ab329ad7eb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/168-0be373e2-eddb-41a3-8e80-39ab329ad7eb.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce3bb27ffd2596749781140997e0e62af2e97839 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/168-0be373e2-eddb-41a3-8e80-39ab329ad7eb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/169-5c212a1d-798c-4f30-9f13-5ef6ba082196.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/169-5c212a1d-798c-4f30-9f13-5ef6ba082196.txn new file mode 100644 index 0000000000000000000000000000000000000000..2b17b6d528f2654c099e62161b1b90201c8ff6a0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/169-5c212a1d-798c-4f30-9f13-5ef6ba082196.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/17-9c51d7e5-f802-4fce-a2be-0862f01b8757.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/17-9c51d7e5-f802-4fce-a2be-0862f01b8757.txn new file mode 100644 index 0000000000000000000000000000000000000000..abef3f5aeb4f2811a18806e5b0f4f0069ecf5db2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/17-9c51d7e5-f802-4fce-a2be-0862f01b8757.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/170-2507f3d1-ce3f-4b15-89a6-91db03073db2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/170-2507f3d1-ce3f-4b15-89a6-91db03073db2.txn new file mode 100644 index 0000000000000000000000000000000000000000..d710f7551268d64c93a4b7a109eaa5a40173ac9a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/170-2507f3d1-ce3f-4b15-89a6-91db03073db2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/171-4f5db188-a414-4430-bbbc-3ab35209ccf1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/171-4f5db188-a414-4430-bbbc-3ab35209ccf1.txn new file mode 100644 index 0000000000000000000000000000000000000000..07e63a2b550b84938654f81661e32ea9f9405bf9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/171-4f5db188-a414-4430-bbbc-3ab35209ccf1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/172-df97085a-232a-4404-abfc-a5bd84900241.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/172-df97085a-232a-4404-abfc-a5bd84900241.txn new file mode 100644 index 0000000000000000000000000000000000000000..07484ca693f46d193d6363e6422089531369c116 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/172-df97085a-232a-4404-abfc-a5bd84900241.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/173-73de5975-a331-45e7-b010-0f62589d0b80.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/173-73de5975-a331-45e7-b010-0f62589d0b80.txn new file mode 100644 index 0000000000000000000000000000000000000000..68ef010a0967459c316b1dc539dc56f66844eafa Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/173-73de5975-a331-45e7-b010-0f62589d0b80.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/174-c46ebc65-616e-42f1-92ca-f8a5a4ce52b7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/174-c46ebc65-616e-42f1-92ca-f8a5a4ce52b7.txn new file mode 100644 index 0000000000000000000000000000000000000000..d5a8e99c53ddb1b9def59c01cd17c2e40a245f46 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/174-c46ebc65-616e-42f1-92ca-f8a5a4ce52b7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/175-45e623a6-28cc-4651-815e-63214cc073e8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/175-45e623a6-28cc-4651-815e-63214cc073e8.txn new file mode 100644 index 0000000000000000000000000000000000000000..ceb47a3418683698f34337fb3dcc02d51a5fd63d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/175-45e623a6-28cc-4651-815e-63214cc073e8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/176-ef95b9b6-30a7-4d16-a8cd-043d78b1b659.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/176-ef95b9b6-30a7-4d16-a8cd-043d78b1b659.txn new file mode 100644 index 0000000000000000000000000000000000000000..16d714c9e3adf526e4e071c098a5f768bcec22ac Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/176-ef95b9b6-30a7-4d16-a8cd-043d78b1b659.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/177-787271e2-c86c-4a26-8392-90f266474c36.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/177-787271e2-c86c-4a26-8392-90f266474c36.txn new file mode 100644 index 0000000000000000000000000000000000000000..c1c6f8a33c5d67a77cbff020933ee1df35dec9fb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/177-787271e2-c86c-4a26-8392-90f266474c36.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/178-9ce2ec91-5032-44a8-aafc-80ade877f7f2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/178-9ce2ec91-5032-44a8-aafc-80ade877f7f2.txn new file mode 100644 index 0000000000000000000000000000000000000000..485aeb85560e6a80b9a89f437acf9051845e95e5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/178-9ce2ec91-5032-44a8-aafc-80ade877f7f2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/179-0462717f-9894-4527-a76c-42c1c6861406.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/179-0462717f-9894-4527-a76c-42c1c6861406.txn new file mode 100644 index 0000000000000000000000000000000000000000..e48e7cce9e5deb03416fdb6f337dc38a15a8d47a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/179-0462717f-9894-4527-a76c-42c1c6861406.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/18-ec097609-0e83-459f-82b8-99d48236a7a2.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/18-ec097609-0e83-459f-82b8-99d48236a7a2.txn new file mode 100644 index 0000000000000000000000000000000000000000..2a2b398093a68a6b26eebd2d8ff6bbbe1cc324e3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/18-ec097609-0e83-459f-82b8-99d48236a7a2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/180-d295b92a-f4a0-4a03-ac06-92660b417ef8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/180-d295b92a-f4a0-4a03-ac06-92660b417ef8.txn new file mode 100644 index 0000000000000000000000000000000000000000..5eab3381c27b77fac58b106453d0e74f21aaf897 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/180-d295b92a-f4a0-4a03-ac06-92660b417ef8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/181-c9e884cd-68bf-4d24-9367-22be848ee18b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/181-c9e884cd-68bf-4d24-9367-22be848ee18b.txn new file mode 100644 index 0000000000000000000000000000000000000000..ddbd45c231b20bef2649aa5aed733c8ac7740397 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/181-c9e884cd-68bf-4d24-9367-22be848ee18b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/182-ff145b50-ee1d-4b89-9f84-d62ab7a9fe07.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/182-ff145b50-ee1d-4b89-9f84-d62ab7a9fe07.txn new file mode 100644 index 0000000000000000000000000000000000000000..0275352e21688a7b0fed19c7b924a28cc89629f2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/182-ff145b50-ee1d-4b89-9f84-d62ab7a9fe07.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/183-d435e1d8-feed-490d-a77c-2c6d1fd332e3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/183-d435e1d8-feed-490d-a77c-2c6d1fd332e3.txn new file mode 100644 index 0000000000000000000000000000000000000000..a21a09be2bf905811880e16c830d34009f79cd01 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/183-d435e1d8-feed-490d-a77c-2c6d1fd332e3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/184-a3fc9c5e-e8c7-4747-afd2-801b8e7dcf43.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/184-a3fc9c5e-e8c7-4747-afd2-801b8e7dcf43.txn new file mode 100644 index 0000000000000000000000000000000000000000..0fa14cbaab6e09488124740ec637f442813de97c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/184-a3fc9c5e-e8c7-4747-afd2-801b8e7dcf43.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/185-cc9882af-da23-4e6e-92e4-3ebed2633369.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/185-cc9882af-da23-4e6e-92e4-3ebed2633369.txn new file mode 100644 index 0000000000000000000000000000000000000000..1ee02af6485d4ad34353f7226a3942dc5957753b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/185-cc9882af-da23-4e6e-92e4-3ebed2633369.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/186-5cced23b-fa0f-45a2-b802-c74f0ae8ff02.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/186-5cced23b-fa0f-45a2-b802-c74f0ae8ff02.txn new file mode 100644 index 0000000000000000000000000000000000000000..5ef40ba5c3a996ae92fd0464b0979a28b09426c1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/186-5cced23b-fa0f-45a2-b802-c74f0ae8ff02.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/187-dd16edfe-66d7-4d7b-895e-4bde8d6a837b.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/187-dd16edfe-66d7-4d7b-895e-4bde8d6a837b.txn new file mode 100644 index 0000000000000000000000000000000000000000..2bc39ad6ac32e40e19a395ff8e9cbe7a97ac287b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/187-dd16edfe-66d7-4d7b-895e-4bde8d6a837b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/188-01c334fd-267d-4f0d-b057-0e2f4b738b45.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/188-01c334fd-267d-4f0d-b057-0e2f4b738b45.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ea545fd02b592f67313a0f9561d893db376b9bc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/188-01c334fd-267d-4f0d-b057-0e2f4b738b45.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/189-4c139113-5670-4b24-8edd-bbeea05cca95.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/189-4c139113-5670-4b24-8edd-bbeea05cca95.txn new file mode 100644 index 0000000000000000000000000000000000000000..aeb76f700f24eabe287daf2e7cb7ae52f1e28a3b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/189-4c139113-5670-4b24-8edd-bbeea05cca95.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/19-66c812d9-994c-42a0-8e90-5a944d0183e1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/19-66c812d9-994c-42a0-8e90-5a944d0183e1.txn new file mode 100644 index 0000000000000000000000000000000000000000..4dd25d87187233ffbdb3a1bee06283d014d766fc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/19-66c812d9-994c-42a0-8e90-5a944d0183e1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/190-8acc7e48-5d47-493b-810d-476d467d7197.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/190-8acc7e48-5d47-493b-810d-476d467d7197.txn new file mode 100644 index 0000000000000000000000000000000000000000..5544ee408aaa45ef3769d1c6247d576831afb3cd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/190-8acc7e48-5d47-493b-810d-476d467d7197.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/191-62e89c88-f8cb-43a2-b975-fe7155e07b67.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/191-62e89c88-f8cb-43a2-b975-fe7155e07b67.txn new file mode 100644 index 0000000000000000000000000000000000000000..8f970a495f9f174924e5dd1e929d04258eb29820 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/191-62e89c88-f8cb-43a2-b975-fe7155e07b67.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/192-6adb3f26-362f-4772-99d5-f7d080a2c6dd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/192-6adb3f26-362f-4772-99d5-f7d080a2c6dd.txn new file mode 100644 index 0000000000000000000000000000000000000000..9b49e6b080977f45b36843e95fc3dfdde7aa3f8b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/192-6adb3f26-362f-4772-99d5-f7d080a2c6dd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/193-2125b227-9c4c-4242-9414-90b01efe11e8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/193-2125b227-9c4c-4242-9414-90b01efe11e8.txn new file mode 100644 index 0000000000000000000000000000000000000000..cbd3da8ee3185e880913f7e291f64669ec94b9e7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/193-2125b227-9c4c-4242-9414-90b01efe11e8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/194-cad76999-c018-4e10-a055-fab763c03599.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/194-cad76999-c018-4e10-a055-fab763c03599.txn new file mode 100644 index 0000000000000000000000000000000000000000..819f88a736d312c6826d31de979713aea99cc465 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/194-cad76999-c018-4e10-a055-fab763c03599.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/195-a56916fa-bd27-4a3c-a508-875a2276c808.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/195-a56916fa-bd27-4a3c-a508-875a2276c808.txn new file mode 100644 index 0000000000000000000000000000000000000000..8abccf0af52d9164fce874a1d548e05ffa6ea714 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/195-a56916fa-bd27-4a3c-a508-875a2276c808.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/196-e1bcb4cb-6525-41b5-af19-c69527165674.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/196-e1bcb4cb-6525-41b5-af19-c69527165674.txn new file mode 100644 index 0000000000000000000000000000000000000000..30afde886c8c578b7876af9bbed5a163a1033e4b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/196-e1bcb4cb-6525-41b5-af19-c69527165674.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/197-9a66a3ff-bc12-41c1-aef4-5e3575a44004.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/197-9a66a3ff-bc12-41c1-aef4-5e3575a44004.txn new file mode 100644 index 0000000000000000000000000000000000000000..f7647c4c9830b831ca715c89e5d29512059dac65 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/197-9a66a3ff-bc12-41c1-aef4-5e3575a44004.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/198-c077e1f8-663c-44a7-9a8f-723c6c3791b2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/198-c077e1f8-663c-44a7-9a8f-723c6c3791b2.txn new file mode 100644 index 0000000000000000000000000000000000000000..031852ce8f1812f9b883c86d365dc48332791cf0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/198-c077e1f8-663c-44a7-9a8f-723c6c3791b2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/199-8d7a65ba-fb94-4cd5-a50f-2ff20a2ae17f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/199-8d7a65ba-fb94-4cd5-a50f-2ff20a2ae17f.txn new file mode 100644 index 0000000000000000000000000000000000000000..29270fa678d1743aca04c1580d7917787cec764e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/199-8d7a65ba-fb94-4cd5-a50f-2ff20a2ae17f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/2-1859fb5c-136e-4f72-a966-1d245a23c738.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/2-1859fb5c-136e-4f72-a966-1d245a23c738.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a448b072cd1cd975c5d8fb667439d02f63e5976 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/2-1859fb5c-136e-4f72-a966-1d245a23c738.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/20-fc895578-b997-4f41-a9b2-da43be88b35a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/20-fc895578-b997-4f41-a9b2-da43be88b35a.txn new file mode 100644 index 0000000000000000000000000000000000000000..3393b4bbc77fa5f5a3941214c4ba7e17728a64e6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/20-fc895578-b997-4f41-a9b2-da43be88b35a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/200-b01a7dc0-7191-4803-b67b-6b4e0af2cc0a.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/200-b01a7dc0-7191-4803-b67b-6b4e0af2cc0a.txn new file mode 100644 index 0000000000000000000000000000000000000000..3ba1bced95dc8877559cd54df491f6a3865a0399 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/200-b01a7dc0-7191-4803-b67b-6b4e0af2cc0a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/201-81b0d1ad-cdcb-46f5-9746-59a624353ed6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/201-81b0d1ad-cdcb-46f5-9746-59a624353ed6.txn new file mode 100644 index 0000000000000000000000000000000000000000..8954c3664a30a9a6b3b6ba5859246007b6291dc1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/201-81b0d1ad-cdcb-46f5-9746-59a624353ed6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/202-dd4cbf91-1e85-461f-8151-39501769c230.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/202-dd4cbf91-1e85-461f-8151-39501769c230.txn new file mode 100644 index 0000000000000000000000000000000000000000..91ecce9d8e85591edb67ecbbb339be3e3b625acf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/202-dd4cbf91-1e85-461f-8151-39501769c230.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/203-7e113a22-558a-477e-914b-0c5056d4e151.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/203-7e113a22-558a-477e-914b-0c5056d4e151.txn new file mode 100644 index 0000000000000000000000000000000000000000..4481d11c8ee11fa89beca4558f925583ee2263c2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/203-7e113a22-558a-477e-914b-0c5056d4e151.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/204-a26459ae-ded0-4527-a6a6-114e6c457132.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/204-a26459ae-ded0-4527-a6a6-114e6c457132.txn new file mode 100644 index 0000000000000000000000000000000000000000..87a24d4a5696a82ccbf931e5dc23ffdfec71d664 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/204-a26459ae-ded0-4527-a6a6-114e6c457132.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/205-f2e83ddb-95d8-468e-888f-0acf2bd14396.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/205-f2e83ddb-95d8-468e-888f-0acf2bd14396.txn new file mode 100644 index 0000000000000000000000000000000000000000..5419e372cff516ac590e9d799777f5119236dad8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/205-f2e83ddb-95d8-468e-888f-0acf2bd14396.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/206-7d7817f0-3533-4c8b-a397-f375da2f5ed2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/206-7d7817f0-3533-4c8b-a397-f375da2f5ed2.txn new file mode 100644 index 0000000000000000000000000000000000000000..fc4e274e555c1532c0f82b3dd4c760138008a4ed Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/206-7d7817f0-3533-4c8b-a397-f375da2f5ed2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/207-767135ff-73a9-4194-8973-22550b0f5dc6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/207-767135ff-73a9-4194-8973-22550b0f5dc6.txn new file mode 100644 index 0000000000000000000000000000000000000000..1dae1d09c55489397df38812cfb9022b82cb678f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/207-767135ff-73a9-4194-8973-22550b0f5dc6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/208-fd6f5df1-16f3-48a3-8c43-463bff81205b.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/208-fd6f5df1-16f3-48a3-8c43-463bff81205b.txn new file mode 100644 index 0000000000000000000000000000000000000000..9589c1ab3ed18929ed534597915302b8fd520492 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/208-fd6f5df1-16f3-48a3-8c43-463bff81205b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/209-79f17db7-d47b-4034-b964-4a286dc3ee71.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/209-79f17db7-d47b-4034-b964-4a286dc3ee71.txn new file mode 100644 index 0000000000000000000000000000000000000000..f4ea9938aede448dbf3b01d5d849c81d5caba2cf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/209-79f17db7-d47b-4034-b964-4a286dc3ee71.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/21-d6d2f367-c837-46f4-ae5a-6400036d79bc.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/21-d6d2f367-c837-46f4-ae5a-6400036d79bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e992fa80879339cb32222365210b942da33795b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/21-d6d2f367-c837-46f4-ae5a-6400036d79bc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/210-8d552619-1ce5-4757-83de-04eafeacf8ee.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/210-8d552619-1ce5-4757-83de-04eafeacf8ee.txn new file mode 100644 index 0000000000000000000000000000000000000000..974fcfb62c9fd6df941b525cfc6083b78844af07 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/210-8d552619-1ce5-4757-83de-04eafeacf8ee.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/211-b5dc31b6-ac08-481b-94a0-f384aefe40f4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/211-b5dc31b6-ac08-481b-94a0-f384aefe40f4.txn new file mode 100644 index 0000000000000000000000000000000000000000..e81c192c85b8246c3ea7418ad60153b685be6753 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/211-b5dc31b6-ac08-481b-94a0-f384aefe40f4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/212-24a1f6cb-7fe4-4c9f-8e3c-4c7226981446.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/212-24a1f6cb-7fe4-4c9f-8e3c-4c7226981446.txn new file mode 100644 index 0000000000000000000000000000000000000000..070588d76d889b9b4ae14902fb341ce422320758 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/212-24a1f6cb-7fe4-4c9f-8e3c-4c7226981446.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/213-a16d6b61-2d70-4253-b582-340eae0aca52.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/213-a16d6b61-2d70-4253-b582-340eae0aca52.txn new file mode 100644 index 0000000000000000000000000000000000000000..8c595fd70401be00d4ade9dddeb4660d26f0a840 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/213-a16d6b61-2d70-4253-b582-340eae0aca52.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/214-49d99b3d-1147-4f78-8850-93661094d404.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/214-49d99b3d-1147-4f78-8850-93661094d404.txn new file mode 100644 index 0000000000000000000000000000000000000000..92fa6a1e452da1917c11b811aa4405108df546a2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/214-49d99b3d-1147-4f78-8850-93661094d404.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/215-d8998a1f-c169-4a45-99e2-c41ab76e63e6.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/215-d8998a1f-c169-4a45-99e2-c41ab76e63e6.txn new file mode 100644 index 0000000000000000000000000000000000000000..0a8248f0c875d096e6ad36456469a4bafaa89f8e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/215-d8998a1f-c169-4a45-99e2-c41ab76e63e6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/216-5f322da8-9acc-4a5a-b5be-dbac667b3b7f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/216-5f322da8-9acc-4a5a-b5be-dbac667b3b7f.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a210acd1ed3971e9d7f59aed006d4e6396f3376 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/216-5f322da8-9acc-4a5a-b5be-dbac667b3b7f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/217-5b2108c8-0b26-422c-b73d-d74f2d5c837e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/217-5b2108c8-0b26-422c-b73d-d74f2d5c837e.txn new file mode 100644 index 0000000000000000000000000000000000000000..1aa1d38b0fccc21820abb06a9937ce8cd9da8051 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/217-5b2108c8-0b26-422c-b73d-d74f2d5c837e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/218-8db532bf-6a31-4c6a-b503-8109a1075fbf.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/218-8db532bf-6a31-4c6a-b503-8109a1075fbf.txn new file mode 100644 index 0000000000000000000000000000000000000000..8550c0237562351dd2345270fc84d5ecac2a8700 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/218-8db532bf-6a31-4c6a-b503-8109a1075fbf.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/219-ec4c7d09-c6ea-43a9-a53b-1a00428bed87.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/219-ec4c7d09-c6ea-43a9-a53b-1a00428bed87.txn new file mode 100644 index 0000000000000000000000000000000000000000..3834bdb4ad3b2c1b0939b0f33a0ad7388a9e8d77 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/219-ec4c7d09-c6ea-43a9-a53b-1a00428bed87.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/22-3265d573-c408-4ee2-a681-db22617fd921.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/22-3265d573-c408-4ee2-a681-db22617fd921.txn new file mode 100644 index 0000000000000000000000000000000000000000..0ff2182be510aa9347f9c2b48179ca783ff91166 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/22-3265d573-c408-4ee2-a681-db22617fd921.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/220-290051fb-aa6f-40cd-93a8-2de428422e8e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/220-290051fb-aa6f-40cd-93a8-2de428422e8e.txn new file mode 100644 index 0000000000000000000000000000000000000000..6808c580da55630d593432b42e36a707189bba62 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/220-290051fb-aa6f-40cd-93a8-2de428422e8e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/221-94b28907-a6e3-4ca7-9aed-1aa0070d690a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/221-94b28907-a6e3-4ca7-9aed-1aa0070d690a.txn new file mode 100644 index 0000000000000000000000000000000000000000..c8e45b647135fed8be520c6fb2be63a3e45a4944 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/221-94b28907-a6e3-4ca7-9aed-1aa0070d690a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/222-283d01ee-d989-4eff-83db-ee8d61a32868.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/222-283d01ee-d989-4eff-83db-ee8d61a32868.txn new file mode 100644 index 0000000000000000000000000000000000000000..07935e6f30d3215401674bb05d35195a3fbdfff1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/222-283d01ee-d989-4eff-83db-ee8d61a32868.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/223-d74a4720-fb1c-45f8-bb42-092f846f9b0d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/223-d74a4720-fb1c-45f8-bb42-092f846f9b0d.txn new file mode 100644 index 0000000000000000000000000000000000000000..71b47198954002587f20e6a1b772e0251b7fbd8a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/223-d74a4720-fb1c-45f8-bb42-092f846f9b0d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/224-f3b5e9f4-0b5d-4d57-b34e-efdc2d4f0c84.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/224-f3b5e9f4-0b5d-4d57-b34e-efdc2d4f0c84.txn new file mode 100644 index 0000000000000000000000000000000000000000..f31072c134845d3ddcc12178978d726b782b54bb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/224-f3b5e9f4-0b5d-4d57-b34e-efdc2d4f0c84.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/225-1d94cae2-bd8e-41be-bf1a-c14a44be2444.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/225-1d94cae2-bd8e-41be-bf1a-c14a44be2444.txn new file mode 100644 index 0000000000000000000000000000000000000000..23c875d1fe8770ae875e6d0b1dd24a67ed89ed68 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/225-1d94cae2-bd8e-41be-bf1a-c14a44be2444.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/226-b1a4f162-39bf-4d33-9930-0e2dddac622a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/226-b1a4f162-39bf-4d33-9930-0e2dddac622a.txn new file mode 100644 index 0000000000000000000000000000000000000000..26f0e33186621b02b2896686899622b3f8061f26 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/226-b1a4f162-39bf-4d33-9930-0e2dddac622a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/227-aaab2857-4bf0-44da-8721-2c6d39338213.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/227-aaab2857-4bf0-44da-8721-2c6d39338213.txn new file mode 100644 index 0000000000000000000000000000000000000000..9f7758e21c0e41b6de7366f908f5e14c5addcf3f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/227-aaab2857-4bf0-44da-8721-2c6d39338213.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/228-efdbc6f4-9956-4384-b7be-7bbfe7c5cf95.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/228-efdbc6f4-9956-4384-b7be-7bbfe7c5cf95.txn new file mode 100644 index 0000000000000000000000000000000000000000..80b689d695d7bd734786907049ed797b06ade0b0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/228-efdbc6f4-9956-4384-b7be-7bbfe7c5cf95.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/229-66fa3e6d-4c43-4860-88e2-ce4d4992d6d4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/229-66fa3e6d-4c43-4860-88e2-ce4d4992d6d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..7c9470b1a6bbbe59226177e316489d1c40255c33 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/229-66fa3e6d-4c43-4860-88e2-ce4d4992d6d4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/23-87029e19-7620-4400-b586-f9364d70f847.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/23-87029e19-7620-4400-b586-f9364d70f847.txn new file mode 100644 index 0000000000000000000000000000000000000000..82b138a5f9fe9498efc614f9b7afcb939edf4441 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/23-87029e19-7620-4400-b586-f9364d70f847.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/230-30406c4e-9b61-4cd4-b78c-aefc53f26e37.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/230-30406c4e-9b61-4cd4-b78c-aefc53f26e37.txn new file mode 100644 index 0000000000000000000000000000000000000000..da3cf750ef02001913ffd5efe487a32715905de4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/230-30406c4e-9b61-4cd4-b78c-aefc53f26e37.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/231-67065f6e-2b77-45a0-9e92-8be5605f5bb2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/231-67065f6e-2b77-45a0-9e92-8be5605f5bb2.txn new file mode 100644 index 0000000000000000000000000000000000000000..abc262570854b96fd3c5c6ead2cbf1bf32729781 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/231-67065f6e-2b77-45a0-9e92-8be5605f5bb2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/232-3f5e0ad8-7e92-404b-ba2c-6610cacc4f94.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/232-3f5e0ad8-7e92-404b-ba2c-6610cacc4f94.txn new file mode 100644 index 0000000000000000000000000000000000000000..98c797494cfb9c6dfac82b290f36264bb4ad2b2f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/232-3f5e0ad8-7e92-404b-ba2c-6610cacc4f94.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/233-6d1fe7d0-5901-40d7-bb84-4d1a1762a5b2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/233-6d1fe7d0-5901-40d7-bb84-4d1a1762a5b2.txn new file mode 100644 index 0000000000000000000000000000000000000000..fd8d62c6ebd1ebcb2747f581cb79dcf02c86b02d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/233-6d1fe7d0-5901-40d7-bb84-4d1a1762a5b2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/234-d4745a54-16e9-4915-8530-d19ad9ec78da.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/234-d4745a54-16e9-4915-8530-d19ad9ec78da.txn new file mode 100644 index 0000000000000000000000000000000000000000..5618623c5c2b4eba2a37716007ffec3dcb1ca7cb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/234-d4745a54-16e9-4915-8530-d19ad9ec78da.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/235-aa56a7d8-c76a-46bf-8c52-95c6d70fe3d6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/235-aa56a7d8-c76a-46bf-8c52-95c6d70fe3d6.txn new file mode 100644 index 0000000000000000000000000000000000000000..02fb079c2a575bfcdfd900272a5bbef0f3d904f5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/235-aa56a7d8-c76a-46bf-8c52-95c6d70fe3d6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/236-fb7d4867-40ea-4825-bba0-8f53351de0cc.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/236-fb7d4867-40ea-4825-bba0-8f53351de0cc.txn new file mode 100644 index 0000000000000000000000000000000000000000..4c05ae2b22b9b985017f7125d7ef2e1deb3abeed Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/236-fb7d4867-40ea-4825-bba0-8f53351de0cc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/237-12ab610b-62cb-41e7-82b7-ec4afb06fa63.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/237-12ab610b-62cb-41e7-82b7-ec4afb06fa63.txn new file mode 100644 index 0000000000000000000000000000000000000000..f6631f2b4215375c27b7855aca3ee0cb9cf25811 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/237-12ab610b-62cb-41e7-82b7-ec4afb06fa63.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/238-553ee0e2-5a07-4959-875e-f3bc1f6b09cd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/238-553ee0e2-5a07-4959-875e-f3bc1f6b09cd.txn new file mode 100644 index 0000000000000000000000000000000000000000..3a98f312bddc8c7d7707b81af3c3df1a00ea3c6b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/238-553ee0e2-5a07-4959-875e-f3bc1f6b09cd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/239-a6f37896-e0bd-4c49-8eef-169ff527cacd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/239-a6f37896-e0bd-4c49-8eef-169ff527cacd.txn new file mode 100644 index 0000000000000000000000000000000000000000..ed1d6b5a5c1f9f2c24d566260fb9ed70979e2163 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/239-a6f37896-e0bd-4c49-8eef-169ff527cacd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/24-3e636739-1a02-4f2a-8170-a6abb459b411.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/24-3e636739-1a02-4f2a-8170-a6abb459b411.txn new file mode 100644 index 0000000000000000000000000000000000000000..ead272239af55efd4adc5d8fd13f819ecc629a4e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/24-3e636739-1a02-4f2a-8170-a6abb459b411.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/240-f4f84cb8-424e-4fef-8435-5327cf6d5f60.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/240-f4f84cb8-424e-4fef-8435-5327cf6d5f60.txn new file mode 100644 index 0000000000000000000000000000000000000000..a9a8ced1213d64eb5d4e5d848ab9ba10a2d8687e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/240-f4f84cb8-424e-4fef-8435-5327cf6d5f60.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/241-ed3c8b62-6182-4b11-b55c-5161df1c993e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/241-ed3c8b62-6182-4b11-b55c-5161df1c993e.txn new file mode 100644 index 0000000000000000000000000000000000000000..0823ab2e3f4b4ea5cae3e15139a3815b43ebc34f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/241-ed3c8b62-6182-4b11-b55c-5161df1c993e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/242-a527d5a7-398f-485e-91df-69b2ced2f5d5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/242-a527d5a7-398f-485e-91df-69b2ced2f5d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb7b9c87751d1cf9bb87a0289f337c2197d1bbd6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/242-a527d5a7-398f-485e-91df-69b2ced2f5d5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/243-4a5fd274-887f-4909-958f-f5c8b881070b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/243-4a5fd274-887f-4909-958f-f5c8b881070b.txn new file mode 100644 index 0000000000000000000000000000000000000000..b576a76e5094e7c024f2276e33a0be310a9f255e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/243-4a5fd274-887f-4909-958f-f5c8b881070b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/244-b66e1778-7e86-47a3-b1ff-06dfe4f63bc6.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/244-b66e1778-7e86-47a3-b1ff-06dfe4f63bc6.txn new file mode 100644 index 0000000000000000000000000000000000000000..155be5b0625f31e5839b189aa31f1e6503120d07 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/244-b66e1778-7e86-47a3-b1ff-06dfe4f63bc6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/245-03dcf6d7-2f36-49a8-84e6-76357ad104c2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/245-03dcf6d7-2f36-49a8-84e6-76357ad104c2.txn new file mode 100644 index 0000000000000000000000000000000000000000..ea2f6c93e9a36674496b5a83a26d2b1855320f8b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/245-03dcf6d7-2f36-49a8-84e6-76357ad104c2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/246-1cc56273-09a7-4626-950a-f1873d4396d0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/246-1cc56273-09a7-4626-950a-f1873d4396d0.txn new file mode 100644 index 0000000000000000000000000000000000000000..ba5b62af35d0e44d0f8c9552bf9ead0f1434d1c3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/246-1cc56273-09a7-4626-950a-f1873d4396d0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/247-ba647873-8c8e-462d-9955-7bec552eab89.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/247-ba647873-8c8e-462d-9955-7bec552eab89.txn new file mode 100644 index 0000000000000000000000000000000000000000..ca51cdfc27c1c41d7745f391d11939c64407be09 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/247-ba647873-8c8e-462d-9955-7bec552eab89.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/248-1c756886-30fd-4e1d-a89b-0693c89804e6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/248-1c756886-30fd-4e1d-a89b-0693c89804e6.txn new file mode 100644 index 0000000000000000000000000000000000000000..99630cca216c6365f085aaacad5b31655ef8bd06 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/248-1c756886-30fd-4e1d-a89b-0693c89804e6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/249-5cee0711-6e2b-4cf7-af7c-26bcbc7b8773.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/249-5cee0711-6e2b-4cf7-af7c-26bcbc7b8773.txn new file mode 100644 index 0000000000000000000000000000000000000000..baf82f74da1b148223795d489fdd174b9812cdff Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/249-5cee0711-6e2b-4cf7-af7c-26bcbc7b8773.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/25-76cdb4eb-a02e-4b3d-955a-d9603c73a063.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/25-76cdb4eb-a02e-4b3d-955a-d9603c73a063.txn new file mode 100644 index 0000000000000000000000000000000000000000..0e923d3f8f70a5ba3697b78e3265b890b6238094 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/25-76cdb4eb-a02e-4b3d-955a-d9603c73a063.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/250-936a6ea4-3a36-495c-af6a-8071becdcdb8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/250-936a6ea4-3a36-495c-af6a-8071becdcdb8.txn new file mode 100644 index 0000000000000000000000000000000000000000..8d2795b5af72252c185f67ca97bd73f8442fdcef Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/250-936a6ea4-3a36-495c-af6a-8071becdcdb8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/251-97a4bdf2-fc97-46fc-87bf-9b10fb0af8c8.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/251-97a4bdf2-fc97-46fc-87bf-9b10fb0af8c8.txn new file mode 100644 index 0000000000000000000000000000000000000000..c30a9db545013c2f83cc1a8ec36f83b41d0e8c58 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/251-97a4bdf2-fc97-46fc-87bf-9b10fb0af8c8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/252-7d122ac5-55ce-4515-8c8f-68a5b69df708.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/252-7d122ac5-55ce-4515-8c8f-68a5b69df708.txn new file mode 100644 index 0000000000000000000000000000000000000000..4fb6a0e0628fd3a65368d45222e3525313da1f11 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/252-7d122ac5-55ce-4515-8c8f-68a5b69df708.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/253-08f718ee-ed7e-465f-8e5a-9c5454b156e0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/253-08f718ee-ed7e-465f-8e5a-9c5454b156e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..40cfec04f87ab9949529d697d41ff004a31c730f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/253-08f718ee-ed7e-465f-8e5a-9c5454b156e0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/254-69ec232a-2262-4ec8-8e17-e7f6d3e37a88.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/254-69ec232a-2262-4ec8-8e17-e7f6d3e37a88.txn new file mode 100644 index 0000000000000000000000000000000000000000..9b53f389af8c0a27c3d620f2c75be40cc9d80945 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/254-69ec232a-2262-4ec8-8e17-e7f6d3e37a88.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/255-dc55b18d-07ac-4588-9b9c-7e3a338c5f18.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/255-dc55b18d-07ac-4588-9b9c-7e3a338c5f18.txn new file mode 100644 index 0000000000000000000000000000000000000000..1beca1ae1899ec08ff35c5ffaf1b28bce93cd7c1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/255-dc55b18d-07ac-4588-9b9c-7e3a338c5f18.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/256-af733fb1-5278-4f16-8487-39bd37ccde33.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/256-af733fb1-5278-4f16-8487-39bd37ccde33.txn new file mode 100644 index 0000000000000000000000000000000000000000..0c8fb452cb441cb3d3f1291e3049475da37041fc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/256-af733fb1-5278-4f16-8487-39bd37ccde33.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/257-43eb56fe-86de-408b-ab58-22bfcda9a3de.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/257-43eb56fe-86de-408b-ab58-22bfcda9a3de.txn new file mode 100644 index 0000000000000000000000000000000000000000..c0b9775c727bf590057dacf39d5efa4f91de8a21 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/257-43eb56fe-86de-408b-ab58-22bfcda9a3de.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/258-6b561605-6b68-424c-9dbe-7a7065b28a0c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/258-6b561605-6b68-424c-9dbe-7a7065b28a0c.txn new file mode 100644 index 0000000000000000000000000000000000000000..527069628128f017d75199708b6301465ea34ff8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/258-6b561605-6b68-424c-9dbe-7a7065b28a0c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/259-ee2187b2-4b46-4722-923c-fd31747a1923.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/259-ee2187b2-4b46-4722-923c-fd31747a1923.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a1a2531f5460d493a59f5048bb034b590694586 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/259-ee2187b2-4b46-4722-923c-fd31747a1923.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/26-a749fc9f-0b8a-474b-916a-4c6ea1be1520.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/26-a749fc9f-0b8a-474b-916a-4c6ea1be1520.txn new file mode 100644 index 0000000000000000000000000000000000000000..bc3ea1db53d488eb60daf0fd0236e7eb5c721159 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/26-a749fc9f-0b8a-474b-916a-4c6ea1be1520.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/260-ce3260a1-dbed-420c-8ec2-df4b3e68d4f3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/260-ce3260a1-dbed-420c-8ec2-df4b3e68d4f3.txn new file mode 100644 index 0000000000000000000000000000000000000000..e4f1f2fbd910720084f15a9507c417e2a53845b7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/260-ce3260a1-dbed-420c-8ec2-df4b3e68d4f3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/261-5a36e761-591d-40eb-b8b1-7764fc517687.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/261-5a36e761-591d-40eb-b8b1-7764fc517687.txn new file mode 100644 index 0000000000000000000000000000000000000000..0d535d87c3de03229f6ea564238657380f0fb4af Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/261-5a36e761-591d-40eb-b8b1-7764fc517687.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/262-4ff64291-3067-4b74-a16c-d60cf57d1989.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/262-4ff64291-3067-4b74-a16c-d60cf57d1989.txn new file mode 100644 index 0000000000000000000000000000000000000000..e94bcd45773cd1cf41c20f585acb9293be190f10 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/262-4ff64291-3067-4b74-a16c-d60cf57d1989.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/263-76ab048b-3957-4e9c-b654-590ed751f3d3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/263-76ab048b-3957-4e9c-b654-590ed751f3d3.txn new file mode 100644 index 0000000000000000000000000000000000000000..cdfe9da894e53b99d94a306d7da15ceb81c92832 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/263-76ab048b-3957-4e9c-b654-590ed751f3d3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/264-e2707481-d1bb-467b-9813-5801068b4c40.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/264-e2707481-d1bb-467b-9813-5801068b4c40.txn new file mode 100644 index 0000000000000000000000000000000000000000..bdcd74ebe98ec611e4207ad55a350e1c76b28372 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/264-e2707481-d1bb-467b-9813-5801068b4c40.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/265-f5362ca2-3848-428e-bc07-c5aa4eec6fac.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/265-f5362ca2-3848-428e-bc07-c5aa4eec6fac.txn new file mode 100644 index 0000000000000000000000000000000000000000..80f718b69ad97e4c1955ef6284cef185f29a72d4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/265-f5362ca2-3848-428e-bc07-c5aa4eec6fac.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/266-2ca160a1-de6f-4641-a522-6b5cf68c315e.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/266-2ca160a1-de6f-4641-a522-6b5cf68c315e.txn new file mode 100644 index 0000000000000000000000000000000000000000..ceed38f4af75ae3fe82e3f0dbf489fac84482eaf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/266-2ca160a1-de6f-4641-a522-6b5cf68c315e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/267-8e0940e5-bbbe-4bc7-8023-44267f58b9d9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/267-8e0940e5-bbbe-4bc7-8023-44267f58b9d9.txn new file mode 100644 index 0000000000000000000000000000000000000000..54272112db9e22773c3541a2a91dc403a45e1ccf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/267-8e0940e5-bbbe-4bc7-8023-44267f58b9d9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/268-8a6eb20c-c367-4827-bfc3-b3536a1641f1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/268-8a6eb20c-c367-4827-bfc3-b3536a1641f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..d0e100c79417f952f764d914969ecd087ef0241c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/268-8a6eb20c-c367-4827-bfc3-b3536a1641f1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/269-dff201a4-8238-4204-ae8d-963063051487.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/269-dff201a4-8238-4204-ae8d-963063051487.txn new file mode 100644 index 0000000000000000000000000000000000000000..6a5652fd4a343bfb7af33cbd88265847c73a55f4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/269-dff201a4-8238-4204-ae8d-963063051487.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/27-8cfb4cfb-a48c-4aeb-a53b-bf9f29e46854.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/27-8cfb4cfb-a48c-4aeb-a53b-bf9f29e46854.txn new file mode 100644 index 0000000000000000000000000000000000000000..559d2d0cc2938ea75170b16d2049624c52ab51a3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/27-8cfb4cfb-a48c-4aeb-a53b-bf9f29e46854.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/270-9c0cd5d0-6d22-4aa3-a065-1b2e4ba973ef.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/270-9c0cd5d0-6d22-4aa3-a065-1b2e4ba973ef.txn new file mode 100644 index 0000000000000000000000000000000000000000..b31f22509bdfc0423c6e2d0bea7f14cf7f63040f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/270-9c0cd5d0-6d22-4aa3-a065-1b2e4ba973ef.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/271-341a1469-95c1-42d3-806e-9abb34a496a8.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/271-341a1469-95c1-42d3-806e-9abb34a496a8.txn new file mode 100644 index 0000000000000000000000000000000000000000..39a920401ddaf1961f40ad5979749b2dc24a93b4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/271-341a1469-95c1-42d3-806e-9abb34a496a8.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/272-affb0a9e-edfb-4f3f-8c31-deeae695f192.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/272-affb0a9e-edfb-4f3f-8c31-deeae695f192.txn new file mode 100644 index 0000000000000000000000000000000000000000..41f6a6e6b9b311ddbc16b74a153e8f34e3cf672f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/272-affb0a9e-edfb-4f3f-8c31-deeae695f192.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/273-f06d02c9-08d9-430f-adc6-64d732985ffc.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/273-f06d02c9-08d9-430f-adc6-64d732985ffc.txn new file mode 100644 index 0000000000000000000000000000000000000000..b17159d5e97bb17e6435230aeb116e68c3e7ed1a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/273-f06d02c9-08d9-430f-adc6-64d732985ffc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/274-7fc88b06-e73d-4810-bd65-1a88e68799f9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/274-7fc88b06-e73d-4810-bd65-1a88e68799f9.txn new file mode 100644 index 0000000000000000000000000000000000000000..2e5d0a2ebd79765240ee90d5a821bbffaa774a68 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/274-7fc88b06-e73d-4810-bd65-1a88e68799f9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/275-72e08d13-9b2e-4fd5-b39d-2d434a430e14.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/275-72e08d13-9b2e-4fd5-b39d-2d434a430e14.txn new file mode 100644 index 0000000000000000000000000000000000000000..e43c59754f3852ab8b2b8dc4848fa403e687fd30 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/275-72e08d13-9b2e-4fd5-b39d-2d434a430e14.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/276-fbb6c784-3a3f-4d55-ade9-37b55d5764fd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/276-fbb6c784-3a3f-4d55-ade9-37b55d5764fd.txn new file mode 100644 index 0000000000000000000000000000000000000000..30d56004a9fd5b30da3df574d57bdff5d82beb19 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/276-fbb6c784-3a3f-4d55-ade9-37b55d5764fd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/277-ed22d511-42e5-4d98-a443-c6b3397e7523.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/277-ed22d511-42e5-4d98-a443-c6b3397e7523.txn new file mode 100644 index 0000000000000000000000000000000000000000..e2e6c35247ab84cec7d1324c9fffb2251f91b5d8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/277-ed22d511-42e5-4d98-a443-c6b3397e7523.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/278-3fe0baa6-5aea-41ab-b29f-0178c0e79951.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/278-3fe0baa6-5aea-41ab-b29f-0178c0e79951.txn new file mode 100644 index 0000000000000000000000000000000000000000..605847f8c1da3a5724f4d5ca20a7af56b1f054af Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/278-3fe0baa6-5aea-41ab-b29f-0178c0e79951.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/279-37483963-8ccb-45b1-91be-f16c6ff32e6e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/279-37483963-8ccb-45b1-91be-f16c6ff32e6e.txn new file mode 100644 index 0000000000000000000000000000000000000000..d301f34edbd55c7945b798f98608b3d696df15ba Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/279-37483963-8ccb-45b1-91be-f16c6ff32e6e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/28-ef189cb8-f727-48a4-99f1-56f2e1b39231.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/28-ef189cb8-f727-48a4-99f1-56f2e1b39231.txn new file mode 100644 index 0000000000000000000000000000000000000000..401d25d6ed59e78c56fa028ad03889adf6a7aa52 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/28-ef189cb8-f727-48a4-99f1-56f2e1b39231.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/280-8d5db27d-7d3e-4cb5-bdf0-78281b222f31.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/280-8d5db27d-7d3e-4cb5-bdf0-78281b222f31.txn new file mode 100644 index 0000000000000000000000000000000000000000..8a8721607f81360f7ab9ef994d2bcb46d1950216 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/280-8d5db27d-7d3e-4cb5-bdf0-78281b222f31.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/281-39fc147b-5209-4c16-9fe2-354f5b7b9a5c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/281-39fc147b-5209-4c16-9fe2-354f5b7b9a5c.txn new file mode 100644 index 0000000000000000000000000000000000000000..e17b152440e744465e33dc763dc5c65b0587e792 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/281-39fc147b-5209-4c16-9fe2-354f5b7b9a5c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/282-1b213a98-1130-4c9f-83e2-3ddcb841f713.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/282-1b213a98-1130-4c9f-83e2-3ddcb841f713.txn new file mode 100644 index 0000000000000000000000000000000000000000..e87cc1d881ebb3dec167d9020a83b8e19dc0cbe8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/282-1b213a98-1130-4c9f-83e2-3ddcb841f713.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/283-b4e71adf-c5a9-4f76-a7fd-998f3661c02a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/283-b4e71adf-c5a9-4f76-a7fd-998f3661c02a.txn new file mode 100644 index 0000000000000000000000000000000000000000..fac78d2c41f7a41dd58b7360dde3bab7c4409c39 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/283-b4e71adf-c5a9-4f76-a7fd-998f3661c02a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/284-2023e103-4e34-4233-8756-805574fdfde1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/284-2023e103-4e34-4233-8756-805574fdfde1.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b539355b3aa1751831f9172c0a346bd2f425070 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/284-2023e103-4e34-4233-8756-805574fdfde1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/285-16525740-92f3-4246-8281-ff402c9f293c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/285-16525740-92f3-4246-8281-ff402c9f293c.txn new file mode 100644 index 0000000000000000000000000000000000000000..bcdc4fb4a334b4aa8f613fa1ea33176fcd0e6e9e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/285-16525740-92f3-4246-8281-ff402c9f293c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/286-3a22ba1d-a4ba-4ccb-bb41-e7f485b998e1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/286-3a22ba1d-a4ba-4ccb-bb41-e7f485b998e1.txn new file mode 100644 index 0000000000000000000000000000000000000000..2fd96fbbcbb2907bb07ca5c7ce713ccb703f72d1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/286-3a22ba1d-a4ba-4ccb-bb41-e7f485b998e1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/287-9250d79b-94ad-4c75-a6fe-c03de6c4b90b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/287-9250d79b-94ad-4c75-a6fe-c03de6c4b90b.txn new file mode 100644 index 0000000000000000000000000000000000000000..88f1c2dba4bf63111f0ea80015a73dd9299fe82f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/287-9250d79b-94ad-4c75-a6fe-c03de6c4b90b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/288-cf165c03-9b09-48d7-bd4a-99201861e378.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/288-cf165c03-9b09-48d7-bd4a-99201861e378.txn new file mode 100644 index 0000000000000000000000000000000000000000..52085cf90dcd810e09084edcc149591bd115f2c9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/288-cf165c03-9b09-48d7-bd4a-99201861e378.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/289-e759bdb1-c546-4172-8d6d-edd6717b6c9a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/289-e759bdb1-c546-4172-8d6d-edd6717b6c9a.txn new file mode 100644 index 0000000000000000000000000000000000000000..e64e8f1f676ef4fb6d6643efa442eef3b0cb7ae6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/289-e759bdb1-c546-4172-8d6d-edd6717b6c9a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/29-4e2d71b9-740d-4ab2-895a-516c7c167f5e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/29-4e2d71b9-740d-4ab2-895a-516c7c167f5e.txn new file mode 100644 index 0000000000000000000000000000000000000000..e1e06eed78852459e97a55647b9c1edf93779849 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/29-4e2d71b9-740d-4ab2-895a-516c7c167f5e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/290-72a47744-fccf-4b03-af1a-4731050242c4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/290-72a47744-fccf-4b03-af1a-4731050242c4.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0fa36da9d2dabd67c401f350dda4a8842ea2ff9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/290-72a47744-fccf-4b03-af1a-4731050242c4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/291-0d24478e-82c1-4df7-913c-5751f8a13af7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/291-0d24478e-82c1-4df7-913c-5751f8a13af7.txn new file mode 100644 index 0000000000000000000000000000000000000000..dd002235fc93252b8f464a35386cef09088a3bb1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/291-0d24478e-82c1-4df7-913c-5751f8a13af7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/292-ff6cdddc-4079-4631-9b36-1b5792fc13f0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/292-ff6cdddc-4079-4631-9b36-1b5792fc13f0.txn new file mode 100644 index 0000000000000000000000000000000000000000..89b89e56508714f10465731f3b9bd4114a8db7c3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/292-ff6cdddc-4079-4631-9b36-1b5792fc13f0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/293-c19bc921-2aaa-4cd0-8168-3af15a8b6d19.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/293-c19bc921-2aaa-4cd0-8168-3af15a8b6d19.txn new file mode 100644 index 0000000000000000000000000000000000000000..51f2177921d5dfabf80a78faf28b699d49fd5514 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/293-c19bc921-2aaa-4cd0-8168-3af15a8b6d19.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/294-cd263eff-b1a8-4e8d-a072-d2a314b11eae.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/294-cd263eff-b1a8-4e8d-a072-d2a314b11eae.txn new file mode 100644 index 0000000000000000000000000000000000000000..0f4a3534064a9b0c9facf748f8a6ce1f399ee020 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/294-cd263eff-b1a8-4e8d-a072-d2a314b11eae.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/295-8b211b18-29e9-479d-8634-a67cae239346.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/295-8b211b18-29e9-479d-8634-a67cae239346.txn new file mode 100644 index 0000000000000000000000000000000000000000..c70c4628a6dead83c8f81fd747448e072bef7436 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/295-8b211b18-29e9-479d-8634-a67cae239346.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/296-1bb62319-16c1-4b4e-89bd-21caa91d0ab5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/296-1bb62319-16c1-4b4e-89bd-21caa91d0ab5.txn new file mode 100644 index 0000000000000000000000000000000000000000..8317462b9516850223c8a6fd37b2a04903adcb21 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/296-1bb62319-16c1-4b4e-89bd-21caa91d0ab5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/297-02c378dc-0ee6-4b64-943f-73879ce53515.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/297-02c378dc-0ee6-4b64-943f-73879ce53515.txn new file mode 100644 index 0000000000000000000000000000000000000000..53c2d17615ffc0693027ecd8071d40a14e45d215 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/297-02c378dc-0ee6-4b64-943f-73879ce53515.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/298-3db034d3-9caa-4f95-b652-c75288f85392.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/298-3db034d3-9caa-4f95-b652-c75288f85392.txn new file mode 100644 index 0000000000000000000000000000000000000000..9f8b23890ab715a100ace811e383dd8c4791ef07 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/298-3db034d3-9caa-4f95-b652-c75288f85392.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/299-8948f321-f972-43b0-a1a5-506fa5db7b5e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/299-8948f321-f972-43b0-a1a5-506fa5db7b5e.txn new file mode 100644 index 0000000000000000000000000000000000000000..568a0828684ae7bf82b0267264fa39f1bc72e2d2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/299-8948f321-f972-43b0-a1a5-506fa5db7b5e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/3-0e784f96-ef81-4a03-bceb-2f85bc7fcacc.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/3-0e784f96-ef81-4a03-bceb-2f85bc7fcacc.txn new file mode 100644 index 0000000000000000000000000000000000000000..db99e777cbd99a082ad38c2e85cc52f8d09eb8f1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/3-0e784f96-ef81-4a03-bceb-2f85bc7fcacc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/30-66958194-1994-4377-bf41-4560f77de097.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/30-66958194-1994-4377-bf41-4560f77de097.txn new file mode 100644 index 0000000000000000000000000000000000000000..038be139c3275d80dad3bf52739abea5c87a7638 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/30-66958194-1994-4377-bf41-4560f77de097.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/300-f6629e78-4d95-4451-9dfa-1954d41dc174.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/300-f6629e78-4d95-4451-9dfa-1954d41dc174.txn new file mode 100644 index 0000000000000000000000000000000000000000..25fb61bb0ff4e77ff3be7f338e8bd01697195e6a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/300-f6629e78-4d95-4451-9dfa-1954d41dc174.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/301-4598c3fa-7f9f-4cd0-8160-414d15652413.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/301-4598c3fa-7f9f-4cd0-8160-414d15652413.txn new file mode 100644 index 0000000000000000000000000000000000000000..fe1a98378572ef0e5cb857df83291ed9e807d27b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/301-4598c3fa-7f9f-4cd0-8160-414d15652413.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/302-368d9402-8a89-49cf-876f-fd6dc068813f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/302-368d9402-8a89-49cf-876f-fd6dc068813f.txn new file mode 100644 index 0000000000000000000000000000000000000000..b2e8daaf587ee09e5bdf2f864eb92dbc714d201e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/302-368d9402-8a89-49cf-876f-fd6dc068813f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/303-80a9f097-85d1-4c16-8d52-108b6d2906b3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/303-80a9f097-85d1-4c16-8d52-108b6d2906b3.txn new file mode 100644 index 0000000000000000000000000000000000000000..96e6e864e67421aba2f6f9a531b3377666d28a36 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/303-80a9f097-85d1-4c16-8d52-108b6d2906b3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/304-5128247e-8de1-49ec-bb1e-c7ae1973f234.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/304-5128247e-8de1-49ec-bb1e-c7ae1973f234.txn new file mode 100644 index 0000000000000000000000000000000000000000..c3ca09d35a4a8df8a0fa92a5c9f7391100b540ab Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/304-5128247e-8de1-49ec-bb1e-c7ae1973f234.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/305-3d8169e9-b041-480f-a116-fe18db8aa415.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/305-3d8169e9-b041-480f-a116-fe18db8aa415.txn new file mode 100644 index 0000000000000000000000000000000000000000..e8d36aace6da40a29ea9a83bc7757f485d7b073d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/305-3d8169e9-b041-480f-a116-fe18db8aa415.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/306-2e365356-1731-4a84-bed9-8cab0c2f3d6f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/306-2e365356-1731-4a84-bed9-8cab0c2f3d6f.txn new file mode 100644 index 0000000000000000000000000000000000000000..44f96f2366b134052758d594c82694b71863ebb5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/306-2e365356-1731-4a84-bed9-8cab0c2f3d6f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/307-e6e45991-b34d-4335-ad40-d102f5601ce5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/307-e6e45991-b34d-4335-ad40-d102f5601ce5.txn new file mode 100644 index 0000000000000000000000000000000000000000..caf1538029d5729f4704b0a2e3885347127b9c4d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/307-e6e45991-b34d-4335-ad40-d102f5601ce5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/308-f1c9352d-77b9-4811-8926-0e8887060100.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/308-f1c9352d-77b9-4811-8926-0e8887060100.txn new file mode 100644 index 0000000000000000000000000000000000000000..c496a434da1a3d3035182244362e51f1f37330c6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/308-f1c9352d-77b9-4811-8926-0e8887060100.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/309-5c7a7c17-303c-414b-a936-a3867719e660.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/309-5c7a7c17-303c-414b-a936-a3867719e660.txn new file mode 100644 index 0000000000000000000000000000000000000000..91628743df91744b521327c46794930e4e9418bd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/309-5c7a7c17-303c-414b-a936-a3867719e660.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/31-75007bbb-0af6-4218-880b-5c786349421c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/31-75007bbb-0af6-4218-880b-5c786349421c.txn new file mode 100644 index 0000000000000000000000000000000000000000..8d37ea63c91fdb6b38b26b6077e94ea88ae23718 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/31-75007bbb-0af6-4218-880b-5c786349421c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/310-1ed6b650-b8f1-4b34-9a7e-8d1d76c5b9d3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/310-1ed6b650-b8f1-4b34-9a7e-8d1d76c5b9d3.txn new file mode 100644 index 0000000000000000000000000000000000000000..0d783d197a2f2e7b7ad774e5af6f11458ef19c58 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/310-1ed6b650-b8f1-4b34-9a7e-8d1d76c5b9d3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/311-9f126640-93ad-49ec-8f88-323ae2a985ef.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/311-9f126640-93ad-49ec-8f88-323ae2a985ef.txn new file mode 100644 index 0000000000000000000000000000000000000000..86d8daadac5d23c3b197dad62897c7e1ea6bc682 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/311-9f126640-93ad-49ec-8f88-323ae2a985ef.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/312-53878dc1-d10d-4f80-b18b-9a02cdad17e4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/312-53878dc1-d10d-4f80-b18b-9a02cdad17e4.txn new file mode 100644 index 0000000000000000000000000000000000000000..bb8fb954c20b0044106eb62666014b879b35ac58 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/312-53878dc1-d10d-4f80-b18b-9a02cdad17e4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/313-3ac2fe20-806f-4bf0-ad1c-d23e8c0b561c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/313-3ac2fe20-806f-4bf0-ad1c-d23e8c0b561c.txn new file mode 100644 index 0000000000000000000000000000000000000000..af0a4dbd38a1119cd7e89c7adb7c7d742b013340 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/313-3ac2fe20-806f-4bf0-ad1c-d23e8c0b561c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/314-dcf777d8-b001-4fa1-bdee-c3bcd2760e57.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/314-dcf777d8-b001-4fa1-bdee-c3bcd2760e57.txn new file mode 100644 index 0000000000000000000000000000000000000000..0e7a5a53882b3880909353489bfc10a54a442df9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/314-dcf777d8-b001-4fa1-bdee-c3bcd2760e57.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/315-1a2e64ab-dc56-41df-82ca-266bfbc4554f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/315-1a2e64ab-dc56-41df-82ca-266bfbc4554f.txn new file mode 100644 index 0000000000000000000000000000000000000000..23d2d452e7096547c3b7d1ca17d8099f4c6171b0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/315-1a2e64ab-dc56-41df-82ca-266bfbc4554f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/316-4eaf7765-3a52-491e-8fb9-77f480f2415d.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/316-4eaf7765-3a52-491e-8fb9-77f480f2415d.txn new file mode 100644 index 0000000000000000000000000000000000000000..6e7a94e0f63d31efe01eae44343fc713931c2ffc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/316-4eaf7765-3a52-491e-8fb9-77f480f2415d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/317-666b5d61-217d-42d7-9265-17fd498e4543.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/317-666b5d61-217d-42d7-9265-17fd498e4543.txn new file mode 100644 index 0000000000000000000000000000000000000000..dc7a0cd71bad66fcd5a1e5e3cdc00995698aa334 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/317-666b5d61-217d-42d7-9265-17fd498e4543.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/318-7b83aaa0-b5b4-42d0-87d2-44ed9017b81b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/318-7b83aaa0-b5b4-42d0-87d2-44ed9017b81b.txn new file mode 100644 index 0000000000000000000000000000000000000000..ae3483ca68c71ed79f2072d1187010d54c05b6d0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/318-7b83aaa0-b5b4-42d0-87d2-44ed9017b81b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/319-0bc40deb-e9c3-4a34-9ba1-c5421565be27.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/319-0bc40deb-e9c3-4a34-9ba1-c5421565be27.txn new file mode 100644 index 0000000000000000000000000000000000000000..5917930dd3141e69262dd72cddc9459c82d3cda6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/319-0bc40deb-e9c3-4a34-9ba1-c5421565be27.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/32-82077ded-668a-4625-a84f-0877508d5f09.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/32-82077ded-668a-4625-a84f-0877508d5f09.txn new file mode 100644 index 0000000000000000000000000000000000000000..3b386a8cb4f68dec8324c170ad1af134c5be4021 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/32-82077ded-668a-4625-a84f-0877508d5f09.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/320-5a28984a-8e11-4e54-9cf7-f378c29bb4f6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/320-5a28984a-8e11-4e54-9cf7-f378c29bb4f6.txn new file mode 100644 index 0000000000000000000000000000000000000000..e4185763bdf4eb333eebba765f08d977aaed0ea2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/320-5a28984a-8e11-4e54-9cf7-f378c29bb4f6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/321-cf4deda8-d9d2-4c62-ae5e-8724e58761f1.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/321-cf4deda8-d9d2-4c62-ae5e-8724e58761f1.txn new file mode 100644 index 0000000000000000000000000000000000000000..b5d8a3101f57161c23854a8591786da8e5b49ddc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/321-cf4deda8-d9d2-4c62-ae5e-8724e58761f1.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/322-b0f8a3fa-a9a9-4c5a-864b-bc38d487a954.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/322-b0f8a3fa-a9a9-4c5a-864b-bc38d487a954.txn new file mode 100644 index 0000000000000000000000000000000000000000..3bd90f4191bec9da1b340d0630338c6927a65d01 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/322-b0f8a3fa-a9a9-4c5a-864b-bc38d487a954.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/323-ec191acb-9fc2-4633-8138-1a7f63d68140.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/323-ec191acb-9fc2-4633-8138-1a7f63d68140.txn new file mode 100644 index 0000000000000000000000000000000000000000..8d5dacc56e290e72d9d448fb2cacc63d6843dddf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/323-ec191acb-9fc2-4633-8138-1a7f63d68140.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/324-e53c385a-4969-4533-8f79-c02610c0705c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/324-e53c385a-4969-4533-8f79-c02610c0705c.txn new file mode 100644 index 0000000000000000000000000000000000000000..459e3637d65b688200292890da2c200e4da18339 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/324-e53c385a-4969-4533-8f79-c02610c0705c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/325-1b51cd9d-2cd5-4ca9-a011-ef50df951d1a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/325-1b51cd9d-2cd5-4ca9-a011-ef50df951d1a.txn new file mode 100644 index 0000000000000000000000000000000000000000..361dcebb7b2e80ee53041ff575bfd6899f1c9610 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/325-1b51cd9d-2cd5-4ca9-a011-ef50df951d1a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/326-a82d8253-71ee-4e67-aa9d-1d340f93a894.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/326-a82d8253-71ee-4e67-aa9d-1d340f93a894.txn new file mode 100644 index 0000000000000000000000000000000000000000..19bdd365773ec3d1e6f2195aa60ef3c890e738a9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/326-a82d8253-71ee-4e67-aa9d-1d340f93a894.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/327-1d636db1-1bb7-457c-a167-3b5dcf706021.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/327-1d636db1-1bb7-457c-a167-3b5dcf706021.txn new file mode 100644 index 0000000000000000000000000000000000000000..f28022d08b15c5a7342f6ea7db6b5bb80106ddc6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/327-1d636db1-1bb7-457c-a167-3b5dcf706021.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/328-d60b0b42-d7d4-491e-9936-2fb5b42014b5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/328-d60b0b42-d7d4-491e-9936-2fb5b42014b5.txn new file mode 100644 index 0000000000000000000000000000000000000000..75ee48568a31ebeaf18f8915fdb6d40ae06d7170 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/328-d60b0b42-d7d4-491e-9936-2fb5b42014b5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/329-96aaace6-145e-4efa-bfab-de331d85a860.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/329-96aaace6-145e-4efa-bfab-de331d85a860.txn new file mode 100644 index 0000000000000000000000000000000000000000..a72628ea18900c62f492e61d513d6ecf0cc0dd93 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/329-96aaace6-145e-4efa-bfab-de331d85a860.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/33-1e95d94f-feee-4092-8767-b6d61b1db997.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/33-1e95d94f-feee-4092-8767-b6d61b1db997.txn new file mode 100644 index 0000000000000000000000000000000000000000..2b6c2eeb55a33643c186756c19d7ed5a9d274fd3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/33-1e95d94f-feee-4092-8767-b6d61b1db997.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/330-5c89d701-9571-45e3-89a7-29735377abe7.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/330-5c89d701-9571-45e3-89a7-29735377abe7.txn new file mode 100644 index 0000000000000000000000000000000000000000..a2fca1bd0ba186008b7225e1e331f8714c950585 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/330-5c89d701-9571-45e3-89a7-29735377abe7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/331-8f2dbfb8-dd4b-4ce8-a484-49bdc2e8da92.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/331-8f2dbfb8-dd4b-4ce8-a484-49bdc2e8da92.txn new file mode 100644 index 0000000000000000000000000000000000000000..b7bd747ee8b6fc255e226d71d61dd1723800a0b1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/331-8f2dbfb8-dd4b-4ce8-a484-49bdc2e8da92.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/332-ed0c1fac-69cc-4ca6-b2ce-6eee15d89866.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/332-ed0c1fac-69cc-4ca6-b2ce-6eee15d89866.txn new file mode 100644 index 0000000000000000000000000000000000000000..89de8bd39b633225b32184efca89c380d086b508 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/332-ed0c1fac-69cc-4ca6-b2ce-6eee15d89866.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/333-bfb84b78-44e1-48b2-a5bc-ec6cfdfc7eec.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/333-bfb84b78-44e1-48b2-a5bc-ec6cfdfc7eec.txn new file mode 100644 index 0000000000000000000000000000000000000000..c2120ae27f39f9144e625ece22793185a063d653 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/333-bfb84b78-44e1-48b2-a5bc-ec6cfdfc7eec.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/334-4ed98df2-0478-4c94-9810-d2de9650b69f.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/334-4ed98df2-0478-4c94-9810-d2de9650b69f.txn new file mode 100644 index 0000000000000000000000000000000000000000..a189ad0e56f50c46d672d3bf81c7d9f2302086e5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/334-4ed98df2-0478-4c94-9810-d2de9650b69f.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/335-b8f51ef5-1675-4e0d-94e8-d53b4fab9048.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/335-b8f51ef5-1675-4e0d-94e8-d53b4fab9048.txn new file mode 100644 index 0000000000000000000000000000000000000000..fe56a56d231b5f3dea3ab26940e177a9b40e47d2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/335-b8f51ef5-1675-4e0d-94e8-d53b4fab9048.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/336-c9733f5a-8ad6-489d-9a07-ee94f3638b8b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/336-c9733f5a-8ad6-489d-9a07-ee94f3638b8b.txn new file mode 100644 index 0000000000000000000000000000000000000000..d66bd5b31d381f9536b4c37c55579228978f186c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/336-c9733f5a-8ad6-489d-9a07-ee94f3638b8b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/337-7819862e-620a-45c7-bdcd-a9dc790c4bbb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/337-7819862e-620a-45c7-bdcd-a9dc790c4bbb.txn new file mode 100644 index 0000000000000000000000000000000000000000..569cbb1518a5ed679233c02f4b78e3745a5dd977 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/337-7819862e-620a-45c7-bdcd-a9dc790c4bbb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/338-05d02e08-1f2c-4b04-ab4a-2387aa71ebcb.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/338-05d02e08-1f2c-4b04-ab4a-2387aa71ebcb.txn new file mode 100644 index 0000000000000000000000000000000000000000..89e73b967725e8772ab8080f735280e960a1f629 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/338-05d02e08-1f2c-4b04-ab4a-2387aa71ebcb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/339-afd82058-55a3-4385-8624-7f37e53ef823.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/339-afd82058-55a3-4385-8624-7f37e53ef823.txn new file mode 100644 index 0000000000000000000000000000000000000000..5376ff840f96b6327c5374b680c5fc5ea32ff636 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/339-afd82058-55a3-4385-8624-7f37e53ef823.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/34-a66713f7-2d27-450f-b62a-b670f11c0178.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/34-a66713f7-2d27-450f-b62a-b670f11c0178.txn new file mode 100644 index 0000000000000000000000000000000000000000..a8da705631993b7b612d88e24b8de6989c39f90b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/34-a66713f7-2d27-450f-b62a-b670f11c0178.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/340-2f486890-4fc9-4f75-83bc-c2cb8ef5b130.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/340-2f486890-4fc9-4f75-83bc-c2cb8ef5b130.txn new file mode 100644 index 0000000000000000000000000000000000000000..9a6a699ea2c5dc87dc9909db8f7638c9fe3cb86a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/340-2f486890-4fc9-4f75-83bc-c2cb8ef5b130.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/341-7ef2c01f-1749-4bdb-b77f-890625d2997a.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/341-7ef2c01f-1749-4bdb-b77f-890625d2997a.txn new file mode 100644 index 0000000000000000000000000000000000000000..64aed472f81538a01f68f6ce86839174023f0e3c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/341-7ef2c01f-1749-4bdb-b77f-890625d2997a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/342-f24d7d0d-d12e-47f5-a1e2-dbf9a7be4de3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/342-f24d7d0d-d12e-47f5-a1e2-dbf9a7be4de3.txn new file mode 100644 index 0000000000000000000000000000000000000000..e5052fab3171d9d434642e06f6651fede989ce55 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/342-f24d7d0d-d12e-47f5-a1e2-dbf9a7be4de3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/343-0b70d235-44ef-4b15-ae61-65032e4f6cb6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/343-0b70d235-44ef-4b15-ae61-65032e4f6cb6.txn new file mode 100644 index 0000000000000000000000000000000000000000..be6d96e94e1c9d64c0fd734e13cf97ec0a1d0b42 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/343-0b70d235-44ef-4b15-ae61-65032e4f6cb6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/344-fe37e5f0-80fb-4783-a967-3410282e9767.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/344-fe37e5f0-80fb-4783-a967-3410282e9767.txn new file mode 100644 index 0000000000000000000000000000000000000000..97c441c90540abdefcbaff02ea5ad6d9ac3f5bf3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/344-fe37e5f0-80fb-4783-a967-3410282e9767.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/345-4d121c38-31f3-41f4-96d4-bf9a69aa212a.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/345-4d121c38-31f3-41f4-96d4-bf9a69aa212a.txn new file mode 100644 index 0000000000000000000000000000000000000000..a44b2598c9a2144c45fee27ec3a85282d31bc551 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/345-4d121c38-31f3-41f4-96d4-bf9a69aa212a.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/346-a7fb0de2-48cb-4504-99cf-f1460286eb83.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/346-a7fb0de2-48cb-4504-99cf-f1460286eb83.txn new file mode 100644 index 0000000000000000000000000000000000000000..fdc05b241b3eca92668e28dcf71b974921919e36 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/346-a7fb0de2-48cb-4504-99cf-f1460286eb83.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/347-1872c329-932d-4f88-8479-96c4940f17c0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/347-1872c329-932d-4f88-8479-96c4940f17c0.txn new file mode 100644 index 0000000000000000000000000000000000000000..d23cc27daf95c91425510347cf74a599ee6ee1cf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/347-1872c329-932d-4f88-8479-96c4940f17c0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/348-7033e82c-ec8b-4668-8261-54c34c504f20.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/348-7033e82c-ec8b-4668-8261-54c34c504f20.txn new file mode 100644 index 0000000000000000000000000000000000000000..5deca1a9f5517be0965d36904e108779dafa2653 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/348-7033e82c-ec8b-4668-8261-54c34c504f20.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/349-6b0965ee-aebe-4bc1-acc0-ccaab1eb83ab.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/349-6b0965ee-aebe-4bc1-acc0-ccaab1eb83ab.txn new file mode 100644 index 0000000000000000000000000000000000000000..b07ff01301770ffd76fd0e58508eb0d6d7d0460a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/349-6b0965ee-aebe-4bc1-acc0-ccaab1eb83ab.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/35-34e2f94a-8bb9-4380-8ab3-5b18e85c2c74.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/35-34e2f94a-8bb9-4380-8ab3-5b18e85c2c74.txn new file mode 100644 index 0000000000000000000000000000000000000000..187b14407692e58f637797f1999724bc1fea2b1c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/35-34e2f94a-8bb9-4380-8ab3-5b18e85c2c74.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/350-479a8ffa-8749-4495-a0f5-e909ebdfedec.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/350-479a8ffa-8749-4495-a0f5-e909ebdfedec.txn new file mode 100644 index 0000000000000000000000000000000000000000..15ee2ce29de0591bd0dd7450a6b6ea121750d498 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/350-479a8ffa-8749-4495-a0f5-e909ebdfedec.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/351-55aba971-32f1-4958-9667-e515f792bacc.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/351-55aba971-32f1-4958-9667-e515f792bacc.txn new file mode 100644 index 0000000000000000000000000000000000000000..5c1598c7df4425c62ad62f09e10725c430e6aaa3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/351-55aba971-32f1-4958-9667-e515f792bacc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/352-a0358b7c-4bee-476e-b1dd-8528cd352961.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/352-a0358b7c-4bee-476e-b1dd-8528cd352961.txn new file mode 100644 index 0000000000000000000000000000000000000000..e78bc58a09cebca13a70d5efca6d52cf747e7308 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/352-a0358b7c-4bee-476e-b1dd-8528cd352961.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/353-21137f8a-b067-42e2-a726-68e9b47e2a96.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/353-21137f8a-b067-42e2-a726-68e9b47e2a96.txn new file mode 100644 index 0000000000000000000000000000000000000000..46321282b04dd7a7ce7e279ca17830165d8e9756 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/353-21137f8a-b067-42e2-a726-68e9b47e2a96.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/354-969d750a-9c0a-484a-8bc5-a3dfbdf8bc75.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/354-969d750a-9c0a-484a-8bc5-a3dfbdf8bc75.txn new file mode 100644 index 0000000000000000000000000000000000000000..00ae75a79d62c625995b3843bba39199a5307c0f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/354-969d750a-9c0a-484a-8bc5-a3dfbdf8bc75.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/355-452ee4b2-a601-469f-b99f-3dce1fb2b39d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/355-452ee4b2-a601-469f-b99f-3dce1fb2b39d.txn new file mode 100644 index 0000000000000000000000000000000000000000..f4381ae27a6d408f88e60e70cf8f0f22b9ea985e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/355-452ee4b2-a601-469f-b99f-3dce1fb2b39d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/356-b365021a-f35e-4f09-adf5-0c7f16c21e3d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/356-b365021a-f35e-4f09-adf5-0c7f16c21e3d.txn new file mode 100644 index 0000000000000000000000000000000000000000..aa2bfdbe1d014d61a670a3adfd83bd7cee92bc26 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/356-b365021a-f35e-4f09-adf5-0c7f16c21e3d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/357-8ee89972-6955-434d-9711-3e789a5a12b2.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/357-8ee89972-6955-434d-9711-3e789a5a12b2.txn new file mode 100644 index 0000000000000000000000000000000000000000..2831c31fe58e04ec3809f26dc931641ffd0120f0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/357-8ee89972-6955-434d-9711-3e789a5a12b2.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/358-c06e3b79-b1d7-4420-9f8e-a63dd62ce1ac.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/358-c06e3b79-b1d7-4420-9f8e-a63dd62ce1ac.txn new file mode 100644 index 0000000000000000000000000000000000000000..b359e93f1710f6babc5b6a4df46789df5b31f842 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/358-c06e3b79-b1d7-4420-9f8e-a63dd62ce1ac.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/359-6c462893-292c-4edb-989a-24debbf51e60.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/359-6c462893-292c-4edb-989a-24debbf51e60.txn new file mode 100644 index 0000000000000000000000000000000000000000..9791add4a0c44f0a3d194a2aa9f912f49d2326e0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/359-6c462893-292c-4edb-989a-24debbf51e60.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/36-67edddde-8f6a-4d88-85a1-b1082c9d492c.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/36-67edddde-8f6a-4d88-85a1-b1082c9d492c.txn new file mode 100644 index 0000000000000000000000000000000000000000..f4c98441677247ad415c2cd6bad241c140f778a9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/36-67edddde-8f6a-4d88-85a1-b1082c9d492c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/360-ee2c0ef4-3145-42c8-a370-35e9b2e66b70.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/360-ee2c0ef4-3145-42c8-a370-35e9b2e66b70.txn new file mode 100644 index 0000000000000000000000000000000000000000..4d8b91ad1d6624ad35c599291a4731242a96147c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/360-ee2c0ef4-3145-42c8-a370-35e9b2e66b70.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/361-d5e6c101-a404-42ac-9bb9-73aeb24e67d7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/361-d5e6c101-a404-42ac-9bb9-73aeb24e67d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..c055ce0181b28975994c0e97a9abb26318165ee3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/361-d5e6c101-a404-42ac-9bb9-73aeb24e67d7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/362-89f3d1ea-2de1-4ee1-8392-1bdfd7d4d38e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/362-89f3d1ea-2de1-4ee1-8392-1bdfd7d4d38e.txn new file mode 100644 index 0000000000000000000000000000000000000000..01490d43036add365ba4644e6a0829dd9c368339 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/362-89f3d1ea-2de1-4ee1-8392-1bdfd7d4d38e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/363-f6c8c77a-cb47-43c2-a543-05285bdd940e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/363-f6c8c77a-cb47-43c2-a543-05285bdd940e.txn new file mode 100644 index 0000000000000000000000000000000000000000..3259ed666e1cdf9738bd8734eb71d09343ad3027 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/363-f6c8c77a-cb47-43c2-a543-05285bdd940e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/364-70fe9e43-a973-40b8-a616-4725f281b23d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/364-70fe9e43-a973-40b8-a616-4725f281b23d.txn new file mode 100644 index 0000000000000000000000000000000000000000..a75de2f6445ca2cdadbe01a0f201bb71f9a6dc85 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/364-70fe9e43-a973-40b8-a616-4725f281b23d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/365-990ec7b5-acfe-4fd6-a64f-7349fbb07cf6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/365-990ec7b5-acfe-4fd6-a64f-7349fbb07cf6.txn new file mode 100644 index 0000000000000000000000000000000000000000..3aef87bd30f7e0a61b866818dd46244ddfdf737f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/365-990ec7b5-acfe-4fd6-a64f-7349fbb07cf6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/366-213935cc-f06b-4b70-aacc-550415f2ddcb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/366-213935cc-f06b-4b70-aacc-550415f2ddcb.txn new file mode 100644 index 0000000000000000000000000000000000000000..7d3423e908c4bb060ff954213f3c3b755d8d9d59 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/366-213935cc-f06b-4b70-aacc-550415f2ddcb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/367-e9eca6a9-4932-45ca-9025-b34963778e39.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/367-e9eca6a9-4932-45ca-9025-b34963778e39.txn new file mode 100644 index 0000000000000000000000000000000000000000..4b990f7c97170d501b3be30aedbc19d34676917f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/367-e9eca6a9-4932-45ca-9025-b34963778e39.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/368-7c93643e-86f7-4793-91cc-3b01ea81939c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/368-7c93643e-86f7-4793-91cc-3b01ea81939c.txn new file mode 100644 index 0000000000000000000000000000000000000000..3a7d8b69e42cd95c236ef10f71e90829b2bd568c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/368-7c93643e-86f7-4793-91cc-3b01ea81939c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/369-6a4cf1e8-19ab-4d4c-b068-b9369941b0f7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/369-6a4cf1e8-19ab-4d4c-b068-b9369941b0f7.txn new file mode 100644 index 0000000000000000000000000000000000000000..095329dd6ea05a06b5439f919623ba6b60236993 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/369-6a4cf1e8-19ab-4d4c-b068-b9369941b0f7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/37-6adc8593-9729-4aae-8a59-bc72f5b02353.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/37-6adc8593-9729-4aae-8a59-bc72f5b02353.txn new file mode 100644 index 0000000000000000000000000000000000000000..a283bda3b7a4c2826c9d7d87fb2ffafa57eb1f1b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/37-6adc8593-9729-4aae-8a59-bc72f5b02353.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/370-13a38ba2-6d9d-4d50-9d2f-85b17b4fea7d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/370-13a38ba2-6d9d-4d50-9d2f-85b17b4fea7d.txn new file mode 100644 index 0000000000000000000000000000000000000000..033d59c02d855ff2ebea356964b170cf917b5511 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/370-13a38ba2-6d9d-4d50-9d2f-85b17b4fea7d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/371-17d5e4dd-cff2-4bad-9ab0-c4f9289a3c7b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/371-17d5e4dd-cff2-4bad-9ab0-c4f9289a3c7b.txn new file mode 100644 index 0000000000000000000000000000000000000000..0b055cdeb35c6c5d58c7c4287d5f9bc42e38c0f2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/371-17d5e4dd-cff2-4bad-9ab0-c4f9289a3c7b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/372-df1a7da9-0ec3-48d6-b4c8-8117adda1bb5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/372-df1a7da9-0ec3-48d6-b4c8-8117adda1bb5.txn new file mode 100644 index 0000000000000000000000000000000000000000..c6c226e504aa5828a642dae5a6d7b576c6a23296 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/372-df1a7da9-0ec3-48d6-b4c8-8117adda1bb5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/373-720dac05-243d-42e8-91ae-2320923d14b4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/373-720dac05-243d-42e8-91ae-2320923d14b4.txn new file mode 100644 index 0000000000000000000000000000000000000000..8067c7d7fefa44a2cc01e0f57602927d8e2dc2d3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/373-720dac05-243d-42e8-91ae-2320923d14b4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/374-01c7024c-2223-45f6-bc65-91afc6b2d721.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/374-01c7024c-2223-45f6-bc65-91afc6b2d721.txn new file mode 100644 index 0000000000000000000000000000000000000000..52a727b8f5f69a1917fe9522cd091b40129f1b04 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/374-01c7024c-2223-45f6-bc65-91afc6b2d721.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/375-b73eac4d-336e-4135-8226-6e65064cc08e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/375-b73eac4d-336e-4135-8226-6e65064cc08e.txn new file mode 100644 index 0000000000000000000000000000000000000000..85e2017319ac11c4e0567e5e28a14b31db3fd73f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/375-b73eac4d-336e-4135-8226-6e65064cc08e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/376-c68ac0fb-cded-491b-b392-31b02d65c5a5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/376-c68ac0fb-cded-491b-b392-31b02d65c5a5.txn new file mode 100644 index 0000000000000000000000000000000000000000..dbb8a2b94f928fbe1b2a075cfafed6df98769d83 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/376-c68ac0fb-cded-491b-b392-31b02d65c5a5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/377-4e4a91a8-5984-478a-9905-40894e3ddec6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/377-4e4a91a8-5984-478a-9905-40894e3ddec6.txn new file mode 100644 index 0000000000000000000000000000000000000000..63a026976fd51e03eb91bade0d22f1e403ead9d9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/377-4e4a91a8-5984-478a-9905-40894e3ddec6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/378-61dab22f-8e0c-4720-9587-8f29b70305f6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/378-61dab22f-8e0c-4720-9587-8f29b70305f6.txn new file mode 100644 index 0000000000000000000000000000000000000000..fcfd94eae1429aea4ed5b69b8726346be10d9a2b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/378-61dab22f-8e0c-4720-9587-8f29b70305f6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/379-70909d02-cbf8-45e0-815b-2f64c6b652a7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/379-70909d02-cbf8-45e0-815b-2f64c6b652a7.txn new file mode 100644 index 0000000000000000000000000000000000000000..2134885f29c7f626842b5c0663d435c82fc1bc11 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/379-70909d02-cbf8-45e0-815b-2f64c6b652a7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/38-35d85f71-cd59-4bc6-a03b-9977568d98d9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/38-35d85f71-cd59-4bc6-a03b-9977568d98d9.txn new file mode 100644 index 0000000000000000000000000000000000000000..a330f511d547866ac696e5a148c2e972c6c01199 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/38-35d85f71-cd59-4bc6-a03b-9977568d98d9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/380-a105eb55-70e5-410f-b8c3-6566f9d0f8fd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/380-a105eb55-70e5-410f-b8c3-6566f9d0f8fd.txn new file mode 100644 index 0000000000000000000000000000000000000000..e05d5923f6ac654b18ebee2dab3b7604e2e11d40 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/380-a105eb55-70e5-410f-b8c3-6566f9d0f8fd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/381-e91a076c-b8c5-4304-b7cd-f7b01068d159.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/381-e91a076c-b8c5-4304-b7cd-f7b01068d159.txn new file mode 100644 index 0000000000000000000000000000000000000000..db2ac1b6c6092decb2d06d7e593935f3797c6e9e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/381-e91a076c-b8c5-4304-b7cd-f7b01068d159.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/382-41b1a65b-61f6-4b0b-b8c5-460517be7f21.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/382-41b1a65b-61f6-4b0b-b8c5-460517be7f21.txn new file mode 100644 index 0000000000000000000000000000000000000000..efc88bc3b448484f79c9127e10185c513810a639 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/382-41b1a65b-61f6-4b0b-b8c5-460517be7f21.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/383-5b47d11b-f76b-444e-ac76-137462a9cf84.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/383-5b47d11b-f76b-444e-ac76-137462a9cf84.txn new file mode 100644 index 0000000000000000000000000000000000000000..e542afe09de4624626b652aba21b3ecfc245ae5d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/383-5b47d11b-f76b-444e-ac76-137462a9cf84.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/384-5f17d45f-6353-4369-a75e-9ebab55fda3d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/384-5f17d45f-6353-4369-a75e-9ebab55fda3d.txn new file mode 100644 index 0000000000000000000000000000000000000000..a355d7ea703e3b19def44841f358505ddb83b56f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/384-5f17d45f-6353-4369-a75e-9ebab55fda3d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/385-d8421812-3120-437a-9ca4-1a6110dbbe57.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/385-d8421812-3120-437a-9ca4-1a6110dbbe57.txn new file mode 100644 index 0000000000000000000000000000000000000000..d82e1409edd0bd114fe20cfc8455fb323809fc16 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/385-d8421812-3120-437a-9ca4-1a6110dbbe57.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/386-b6c53378-6cb1-4fe7-8130-11ebaabfa271.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/386-b6c53378-6cb1-4fe7-8130-11ebaabfa271.txn new file mode 100644 index 0000000000000000000000000000000000000000..02871cdad62c0e1e9029e145f11436d864a50631 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/386-b6c53378-6cb1-4fe7-8130-11ebaabfa271.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/387-d505f05e-830e-4d9a-b0d6-ed654acb3ed9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/387-d505f05e-830e-4d9a-b0d6-ed654acb3ed9.txn new file mode 100644 index 0000000000000000000000000000000000000000..8989a2d434b80957740991893cf47b2f51e8f211 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/387-d505f05e-830e-4d9a-b0d6-ed654acb3ed9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/388-b53b9b6c-7aa2-4772-af5b-74a8aecaee57.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/388-b53b9b6c-7aa2-4772-af5b-74a8aecaee57.txn new file mode 100644 index 0000000000000000000000000000000000000000..e4b8335a923ba2c5a92ae8401c4edbc33e637e96 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/388-b53b9b6c-7aa2-4772-af5b-74a8aecaee57.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/389-72e8003e-61ae-463f-9241-8294655b8117.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/389-72e8003e-61ae-463f-9241-8294655b8117.txn new file mode 100644 index 0000000000000000000000000000000000000000..1a3c77e0e2a51a7a1be8f4eff8a9f7234b18f3f2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/389-72e8003e-61ae-463f-9241-8294655b8117.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/39-4dfb88d2-698f-410b-9dfc-673cc08624c4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/39-4dfb88d2-698f-410b-9dfc-673cc08624c4.txn new file mode 100644 index 0000000000000000000000000000000000000000..f0333b2259f9ef35949305db9c3d1b1692c56005 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/39-4dfb88d2-698f-410b-9dfc-673cc08624c4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/390-66951b94-d527-4e9a-a0ae-62b61ef873bc.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/390-66951b94-d527-4e9a-a0ae-62b61ef873bc.txn new file mode 100644 index 0000000000000000000000000000000000000000..fdc4c6e0df2d8b749e6f25aee764df639a127f77 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/390-66951b94-d527-4e9a-a0ae-62b61ef873bc.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/391-9075ac38-e085-4808-a0c8-2ca8380a9504.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/391-9075ac38-e085-4808-a0c8-2ca8380a9504.txn new file mode 100644 index 0000000000000000000000000000000000000000..09581de3e06febb73f5fdb29a647f2c92bb6dec0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/391-9075ac38-e085-4808-a0c8-2ca8380a9504.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/392-2f0b08df-0c47-4955-93e8-614aa13be0c3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/392-2f0b08df-0c47-4955-93e8-614aa13be0c3.txn new file mode 100644 index 0000000000000000000000000000000000000000..c9b28184d8ad90e38546d451e588787f167b2397 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/392-2f0b08df-0c47-4955-93e8-614aa13be0c3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/393-a0246a36-20e0-4a4e-a06a-56f306f9a2aa.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/393-a0246a36-20e0-4a4e-a06a-56f306f9a2aa.txn new file mode 100644 index 0000000000000000000000000000000000000000..fbda35caf5232a2b2c45e6ddfe44d4a2f9b33375 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/393-a0246a36-20e0-4a4e-a06a-56f306f9a2aa.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/394-c58300c7-5c09-45b8-80d8-0e80573011b9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/394-c58300c7-5c09-45b8-80d8-0e80573011b9.txn new file mode 100644 index 0000000000000000000000000000000000000000..62295a8bad7aeb15a4518091796aa950cb960cc3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/394-c58300c7-5c09-45b8-80d8-0e80573011b9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/4-082fdc4d-8e04-4383-87d4-bf2009ff6167.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/4-082fdc4d-8e04-4383-87d4-bf2009ff6167.txn new file mode 100644 index 0000000000000000000000000000000000000000..fb7b1e5305152e0b493888d143cbe7c4a851da4a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/4-082fdc4d-8e04-4383-87d4-bf2009ff6167.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/40-b8a38063-4e98-42ab-877b-8c56ffbe4130.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/40-b8a38063-4e98-42ab-877b-8c56ffbe4130.txn new file mode 100644 index 0000000000000000000000000000000000000000..d92492046593b5fae9d695e7fbe97db965a99bbf Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/40-b8a38063-4e98-42ab-877b-8c56ffbe4130.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/41-b2293205-1e36-4ac2-90a2-faa796e383a6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/41-b2293205-1e36-4ac2-90a2-faa796e383a6.txn new file mode 100644 index 0000000000000000000000000000000000000000..3aacf301639d1259cf6fcf802a62d66306f2dc31 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/41-b2293205-1e36-4ac2-90a2-faa796e383a6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/42-2b0273fe-e801-44f5-843c-e6e309a1dc7d.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/42-2b0273fe-e801-44f5-843c-e6e309a1dc7d.txn new file mode 100644 index 0000000000000000000000000000000000000000..016c50d90ce547da5c53324711ed09d45b3a8837 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/42-2b0273fe-e801-44f5-843c-e6e309a1dc7d.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/43-bc3a33c1-4273-489d-893c-a621283441ca.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/43-bc3a33c1-4273-489d-893c-a621283441ca.txn new file mode 100644 index 0000000000000000000000000000000000000000..e7a36cffb6a64a5a8e7e2684fcf03865f3d2deb6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/43-bc3a33c1-4273-489d-893c-a621283441ca.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/44-2df4da43-309f-4db0-8021-e76f21db26f7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/44-2df4da43-309f-4db0-8021-e76f21db26f7.txn new file mode 100644 index 0000000000000000000000000000000000000000..7a2b4b0037ba8dc19714f30bcb32672b5cc4f0a7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/44-2df4da43-309f-4db0-8021-e76f21db26f7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/45-202b2f8a-9509-451b-9ee2-c0e0458a0316.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/45-202b2f8a-9509-451b-9ee2-c0e0458a0316.txn new file mode 100644 index 0000000000000000000000000000000000000000..deabf5eff809d8721699f6fabad344990f5a0cc7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/45-202b2f8a-9509-451b-9ee2-c0e0458a0316.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/46-a8d39ce1-6573-4307-96f4-8ed324fd85d5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/46-a8d39ce1-6573-4307-96f4-8ed324fd85d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..37e1444ef1a807288add10116beb3afc6f84a2f2 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/46-a8d39ce1-6573-4307-96f4-8ed324fd85d5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/47-7f3855ed-1af7-44b5-abc8-224683c363d4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/47-7f3855ed-1af7-44b5-abc8-224683c363d4.txn new file mode 100644 index 0000000000000000000000000000000000000000..fc2ed526b8d81fbf251ee6e836ca86b877af755b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/47-7f3855ed-1af7-44b5-abc8-224683c363d4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/48-7fe1a867-77f8-4c26-be3e-b50a43ba82d7.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/48-7fe1a867-77f8-4c26-be3e-b50a43ba82d7.txn new file mode 100644 index 0000000000000000000000000000000000000000..54dd6abd561cbd6a1f4a18bd98e97b84b2787f2b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/48-7fe1a867-77f8-4c26-be3e-b50a43ba82d7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/49-5444972a-8a75-4f85-bacb-0bfa0fb84acd.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/49-5444972a-8a75-4f85-bacb-0bfa0fb84acd.txn new file mode 100644 index 0000000000000000000000000000000000000000..81426bc8c39403b02e3fd8d39b85466c4ea7408f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/49-5444972a-8a75-4f85-bacb-0bfa0fb84acd.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/5-cfc0e5cf-cb42-4683-ab3c-5f00131643c0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/5-cfc0e5cf-cb42-4683-ab3c-5f00131643c0.txn new file mode 100644 index 0000000000000000000000000000000000000000..8eb6014f30e73a6b5c38a2ce3687dea2cc7b13c8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/5-cfc0e5cf-cb42-4683-ab3c-5f00131643c0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/50-c4de90c0-c5ba-4b02-ba13-b84d12a77df6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/50-c4de90c0-c5ba-4b02-ba13-b84d12a77df6.txn new file mode 100644 index 0000000000000000000000000000000000000000..3f3beea8a08eb2cca89d8552f4ce4e7e587a3c61 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/50-c4de90c0-c5ba-4b02-ba13-b84d12a77df6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/51-187a0685-3c04-4086-848e-65fa0c43d030.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/51-187a0685-3c04-4086-848e-65fa0c43d030.txn new file mode 100644 index 0000000000000000000000000000000000000000..6915a5ded62b26a2fb7083918a90c65a6a3184b5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/51-187a0685-3c04-4086-848e-65fa0c43d030.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/52-8861ed9a-4615-4682-b8d8-add3311bdfb4.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/52-8861ed9a-4615-4682-b8d8-add3311bdfb4.txn new file mode 100644 index 0000000000000000000000000000000000000000..778039399fa637db19f9ed61bd2aa4f0e164b731 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/52-8861ed9a-4615-4682-b8d8-add3311bdfb4.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/53-4c60c928-7bfc-4e44-a30e-9a5971af45a3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/53-4c60c928-7bfc-4e44-a30e-9a5971af45a3.txn new file mode 100644 index 0000000000000000000000000000000000000000..ce062bf8906b94b2142bc2c03d5ec219d06f7e36 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/53-4c60c928-7bfc-4e44-a30e-9a5971af45a3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/54-a5c6d605-6bd4-46ae-830c-2f7ab6196e2e.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/54-a5c6d605-6bd4-46ae-830c-2f7ab6196e2e.txn new file mode 100644 index 0000000000000000000000000000000000000000..54bb26a6942825d318dd285cb282d00b75ecffe8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/54-a5c6d605-6bd4-46ae-830c-2f7ab6196e2e.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/55-fe77d1d5-9287-4d4b-bd8b-95bfe4b8af18.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/55-fe77d1d5-9287-4d4b-bd8b-95bfe4b8af18.txn new file mode 100644 index 0000000000000000000000000000000000000000..4a758da78a5a6e1b32f24e1fc31f655c5e216cbd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/55-fe77d1d5-9287-4d4b-bd8b-95bfe4b8af18.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/56-3131b3e8-3084-43cb-b4c7-a1864e194a25.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/56-3131b3e8-3084-43cb-b4c7-a1864e194a25.txn new file mode 100644 index 0000000000000000000000000000000000000000..3caa8f85241c36671dcde03f7734b1e3476e4444 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/56-3131b3e8-3084-43cb-b4c7-a1864e194a25.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/57-30a12ea3-15cd-477a-80ad-d69d689e73bb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/57-30a12ea3-15cd-477a-80ad-d69d689e73bb.txn new file mode 100644 index 0000000000000000000000000000000000000000..f1fb33337ff255a1180b3eee5fa9c2a81577723c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/57-30a12ea3-15cd-477a-80ad-d69d689e73bb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/58-c3c1016e-6317-472a-99e7-7ce8cd35af6c.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/58-c3c1016e-6317-472a-99e7-7ce8cd35af6c.txn new file mode 100644 index 0000000000000000000000000000000000000000..3e2b72c11cb3ead6e775fd92c9a22549c1ec1571 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/58-c3c1016e-6317-472a-99e7-7ce8cd35af6c.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/59-4b9d9f2c-8ba9-4f7c-a23d-0142d154be5b.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/59-4b9d9f2c-8ba9-4f7c-a23d-0142d154be5b.txn new file mode 100644 index 0000000000000000000000000000000000000000..6be664d104256dcc26f9bbf1b817210c7b5d3bc4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/59-4b9d9f2c-8ba9-4f7c-a23d-0142d154be5b.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/6-fb84480d-fb8b-4e86-8dc3-89dc333481e0.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/6-fb84480d-fb8b-4e86-8dc3-89dc333481e0.txn new file mode 100644 index 0000000000000000000000000000000000000000..4c178b9cfcd7b4541a796b928de4c9bab7a2f3ec Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/6-fb84480d-fb8b-4e86-8dc3-89dc333481e0.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/60-ad0512de-5ccc-4a3f-8080-cda065be7712.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/60-ad0512de-5ccc-4a3f-8080-cda065be7712.txn new file mode 100644 index 0000000000000000000000000000000000000000..80edf83bdaf3a321415e56a807e0e31b9563b8a4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/60-ad0512de-5ccc-4a3f-8080-cda065be7712.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/61-94d0204c-521f-445f-8cc8-084f06779031.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/61-94d0204c-521f-445f-8cc8-084f06779031.txn new file mode 100644 index 0000000000000000000000000000000000000000..32921544e9355f420b44a20d0684ee7d61a8876a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/61-94d0204c-521f-445f-8cc8-084f06779031.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/62-c51d8ff4-5589-4139-8e35-4c1e4257af86.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/62-c51d8ff4-5589-4139-8e35-4c1e4257af86.txn new file mode 100644 index 0000000000000000000000000000000000000000..26800b920f69c1780f170edeaf28964eeb2cb661 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/62-c51d8ff4-5589-4139-8e35-4c1e4257af86.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/63-e0be43bb-f1f2-47a2-a347-14a52444a963.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/63-e0be43bb-f1f2-47a2-a347-14a52444a963.txn new file mode 100644 index 0000000000000000000000000000000000000000..9ae64ce25d7c0c24e6e65f691eb50667e38eaad1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/63-e0be43bb-f1f2-47a2-a347-14a52444a963.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/64-cc285900-f118-4a79-b250-68f1672525d5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/64-cc285900-f118-4a79-b250-68f1672525d5.txn new file mode 100644 index 0000000000000000000000000000000000000000..eec300a57afe29828bee3b3e349674577b4066d8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/64-cc285900-f118-4a79-b250-68f1672525d5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/65-9f8d5528-9a64-4706-bbbc-247c5b6d0d19.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/65-9f8d5528-9a64-4706-bbbc-247c5b6d0d19.txn new file mode 100644 index 0000000000000000000000000000000000000000..7d4bad05193e3bc2eee11b3ac238ab56605fe528 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/65-9f8d5528-9a64-4706-bbbc-247c5b6d0d19.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/66-d0b8556b-7b08-4383-83ef-e1fe547bd7f3.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/66-d0b8556b-7b08-4383-83ef-e1fe547bd7f3.txn new file mode 100644 index 0000000000000000000000000000000000000000..a4d8433e2f916203f4f7a3c8e95457cabb4d7ddd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/66-d0b8556b-7b08-4383-83ef-e1fe547bd7f3.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/67-589538bd-9d1f-40b6-bf9b-dbc8123ac2a7.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/67-589538bd-9d1f-40b6-bf9b-dbc8123ac2a7.txn new file mode 100644 index 0000000000000000000000000000000000000000..5605419c6b4dc03a63883fa7de072a6bbf88c162 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/67-589538bd-9d1f-40b6-bf9b-dbc8123ac2a7.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/68-14089833-71e5-4ac0-9bab-02f300bc4049.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/68-14089833-71e5-4ac0-9bab-02f300bc4049.txn new file mode 100644 index 0000000000000000000000000000000000000000..022f01022a8eead2a8ec2e652798b88c8b475a26 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/68-14089833-71e5-4ac0-9bab-02f300bc4049.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/69-38b04a3e-751a-49b5-9da1-416772785c41.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/69-38b04a3e-751a-49b5-9da1-416772785c41.txn new file mode 100644 index 0000000000000000000000000000000000000000..5469752bcb0f33e46f958e81d748d8e2eafa2ba3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/69-38b04a3e-751a-49b5-9da1-416772785c41.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/7-8a40d13d-4ae7-48dc-8e75-e21cbdadc779.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/7-8a40d13d-4ae7-48dc-8e75-e21cbdadc779.txn new file mode 100644 index 0000000000000000000000000000000000000000..23e75a662289151290502a1f64278939c8dc4f63 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/7-8a40d13d-4ae7-48dc-8e75-e21cbdadc779.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/70-baabc80a-9edb-4b55-b0c3-2bc1f62e31bb.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/70-baabc80a-9edb-4b55-b0c3-2bc1f62e31bb.txn new file mode 100644 index 0000000000000000000000000000000000000000..aa38eaef1717bc0060385199be597d5eb5bd5b34 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/70-baabc80a-9edb-4b55-b0c3-2bc1f62e31bb.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/71-697b8b50-adb1-4d3b-85ba-c2f3b31920e5.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/71-697b8b50-adb1-4d3b-85ba-c2f3b31920e5.txn new file mode 100644 index 0000000000000000000000000000000000000000..7aa39554a4a32749b5938ea5a7d1d141ef3e53af Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/71-697b8b50-adb1-4d3b-85ba-c2f3b31920e5.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/72-28aec803-40df-4098-a0ed-ab9fb2fe9093.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/72-28aec803-40df-4098-a0ed-ab9fb2fe9093.txn new file mode 100644 index 0000000000000000000000000000000000000000..5ab0616d92298a0aa09a864d65fffe2f6ea4fd0d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/72-28aec803-40df-4098-a0ed-ab9fb2fe9093.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/73-b9f44afd-4e44-4cea-8b75-37a3323b6eef.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/73-b9f44afd-4e44-4cea-8b75-37a3323b6eef.txn new file mode 100644 index 0000000000000000000000000000000000000000..5141a052a785111186d0419ebecd80d2739af898 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/73-b9f44afd-4e44-4cea-8b75-37a3323b6eef.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/74-edcd23a6-8933-4ed0-87ae-844187bad209.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/74-edcd23a6-8933-4ed0-87ae-844187bad209.txn new file mode 100644 index 0000000000000000000000000000000000000000..650896d8ac98ba659b04993a12f1f3792a2d7a60 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/74-edcd23a6-8933-4ed0-87ae-844187bad209.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/75-f230805b-343e-476d-ab6b-4de5c1c94fa9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/75-f230805b-343e-476d-ab6b-4de5c1c94fa9.txn new file mode 100644 index 0000000000000000000000000000000000000000..0d89f71d5f6a9464bee37e28b88af106fb270c0b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/75-f230805b-343e-476d-ab6b-4de5c1c94fa9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/76-a88d4d31-4fce-4a9b-9413-9cecb8e0ce38.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/76-a88d4d31-4fce-4a9b-9413-9cecb8e0ce38.txn new file mode 100644 index 0000000000000000000000000000000000000000..3a3487b62e3f39d85096d773f81fae87b8cfde73 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/76-a88d4d31-4fce-4a9b-9413-9cecb8e0ce38.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/77-f4c8e0b3-3aef-4a8f-bd76-19b5714f80ff.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/77-f4c8e0b3-3aef-4a8f-bd76-19b5714f80ff.txn new file mode 100644 index 0000000000000000000000000000000000000000..bc3b91188d7e27311d537dc8c09ced79e8b1f8a7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/77-f4c8e0b3-3aef-4a8f-bd76-19b5714f80ff.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/78-43c20dc7-515b-4f18-8b82-3a4da0367072.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/78-43c20dc7-515b-4f18-8b82-3a4da0367072.txn new file mode 100644 index 0000000000000000000000000000000000000000..c212d2de58503897d7d7fdc62f453f3b6b845d90 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/78-43c20dc7-515b-4f18-8b82-3a4da0367072.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/79-dad2dac5-4612-4fe7-91dc-ff6570aae416.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/79-dad2dac5-4612-4fe7-91dc-ff6570aae416.txn new file mode 100644 index 0000000000000000000000000000000000000000..0e839f74a9de09e6c22a9a66f8c8b655a2adde71 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/79-dad2dac5-4612-4fe7-91dc-ff6570aae416.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/8-b4b0ba71-3975-4464-ae74-6a7683ad6f95.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/8-b4b0ba71-3975-4464-ae74-6a7683ad6f95.txn new file mode 100644 index 0000000000000000000000000000000000000000..2189743a574e612109d1218e3b376bf2fe527d1b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/8-b4b0ba71-3975-4464-ae74-6a7683ad6f95.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/80-935f5615-63f3-4da0-85f3-528f3e40d0e9.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/80-935f5615-63f3-4da0-85f3-528f3e40d0e9.txn new file mode 100644 index 0000000000000000000000000000000000000000..98ce8bf145060d71e34743a1c7cd1868d69a4352 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/80-935f5615-63f3-4da0-85f3-528f3e40d0e9.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/81-89d76500-6924-40d3-806e-e37d22a57db6.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/81-89d76500-6924-40d3-806e-e37d22a57db6.txn new file mode 100644 index 0000000000000000000000000000000000000000..f4d63fa762b8d4a7128c42d9a34b8aa0a2d0591d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/81-89d76500-6924-40d3-806e-e37d22a57db6.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/82-f91e0c5b-b409-44c6-900a-00f2ee551d95.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/82-f91e0c5b-b409-44c6-900a-00f2ee551d95.txn new file mode 100644 index 0000000000000000000000000000000000000000..7b461caaca8bff1ec95e7de01f43c9b3c5d8b3fe Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/82-f91e0c5b-b409-44c6-900a-00f2ee551d95.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/83-c52f2465-85d3-48b2-93d6-68f0055c3a80.txn b/.lancedb/nltk_chunking_BAAI.lance/_transactions/83-c52f2465-85d3-48b2-93d6-68f0055c3a80.txn new file mode 100644 index 0000000000000000000000000000000000000000..0fc737a1def222826faa1b1797faf2f3de327637 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/83-c52f2465-85d3-48b2-93d6-68f0055c3a80.txn differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_transactions/84-308659ec-a548-4210-a12b-d4b96a5d98f9.txn 
b/.lancedb/nltk_chunking_BAAI.lance/_transactions/84-308659ec-a548-4210-a12b-d4b96a5d98f9.txn
new file mode 100644
index 0000000000000000000000000000000000000000..cb610805dcdcd1ab78f0af8b7e29af5cd700c4fa
Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_transactions/84-308659ec-a548-4210-a12b-d4b96a5d98f9.txn differ
[Further new binary files under .lancedb/nltk_chunking_BAAI.lance/: the _transactions/*.txn entries 9 and 85 through 99, and the _versions/*.manifest entries that sort lexicographically before 280 (1, 2, 10 through 28, and 100 through 279); each is added as a new file (mode 100644) whose diff reads "Binary files /dev/null and b/<path> differ".]
diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/280.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/280.manifest
new file mode 100644
index 0000000000000000000000000000000000000000..ebf3e9ee3b7aadb55190f43e4706832f1b81297a
Binary
files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/280.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/281.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/281.manifest new file mode 100644 index 0000000000000000000000000000000000000000..565bc0a38463e808b502e1208ba2fad700d8f01e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/281.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/282.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/282.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8e267d4489fd1a9d00e78de710f1b446d3f7c76b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/282.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/283.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/283.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fedb2ca7074d81b819d79d25e2356bab9dd59876 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/283.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/284.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/284.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d381ae811ece210b2c9dde3ae95fa45ffcfca0ab Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/284.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/285.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/285.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ea8574b5c12950b1a2c9c3e3804a7c05913e1af Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/285.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/286.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/286.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b089acf73e78b085d9a84f5533cb1f7335f59ff0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/286.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/287.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/287.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b35ccddc8ea05be3ab414f2bf75ead55dc5cfbb8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/287.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/288.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/288.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e72101ce1773f4889a3a9c3091aed0321f543254 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/288.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/289.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/289.manifest new file mode 100644 index 0000000000000000000000000000000000000000..06e4892c878bbc2610acc63bf4d69c4b287dee2e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/289.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/29.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/29.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0860467db3814691f017627c3f7190c6c9b38856 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/29.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/290.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/290.manifest new file mode 100644 
index 0000000000000000000000000000000000000000..df5b66b2d8646934ae00fec4d05ef86722475975 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/290.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/291.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/291.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b128e20f03df2ed9b58399c0de162532db8fcd4f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/291.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/292.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/292.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f3b9fdb06e114882a066d4172d3209e525d59714 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/292.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/293.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/293.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d1795abe7690a0fec499e6942d32e7526374fe74 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/293.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/294.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/294.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3c5e95c703a5f97803ccaaac82d4a5fe3193f7cb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/294.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/295.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/295.manifest new file mode 100644 index 0000000000000000000000000000000000000000..833a77c4c835246d3ea5662e0c93ca6e03cb3ca6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/295.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/296.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/296.manifest new file mode 100644 index 0000000000000000000000000000000000000000..62b503e74b8da5bfc2e54ff0363212b9ab260a08 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/296.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/297.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/297.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2dd731e0617772fea24f1cfc9124c58739968209 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/297.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/298.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/298.manifest new file mode 100644 index 0000000000000000000000000000000000000000..aac68f08131f956a510c736cb1c72eec705e6ef1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/298.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/299.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/299.manifest new file mode 100644 index 0000000000000000000000000000000000000000..66224fc02e3301a1f592e693a37e307627ba93ee Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/299.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/3.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/3.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0f62f3ff0b1a0ece040d7ab0396b5a28091b0978 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/3.manifest differ diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/_versions/30.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/30.manifest new file mode 100644 index 0000000000000000000000000000000000000000..16ebcd34dba6b615e5f51ca4353185e4592419ef Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/30.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/300.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/300.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f14fd15f8234a06224517a1451171d5a3484b4e9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/300.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/301.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/301.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f4c02b1b0bba0f0fe9110f757575e3a4cb9cd86d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/301.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/302.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/302.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a9f42a532c36475bee12fb6bd0bfc0e3aa53842 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/302.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/303.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/303.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bb226f79018a904d519a0fc3f17bfc3766ced3b4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/303.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/304.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/304.manifest new file mode 100644 index 0000000000000000000000000000000000000000..766e5bcf030869c38ce4e994faf8730905f886b5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/304.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/305.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/305.manifest new file mode 100644 index 0000000000000000000000000000000000000000..92aab82645e36fb6d34a5fb932f2a625c21f8c2f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/305.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/306.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/306.manifest new file mode 100644 index 0000000000000000000000000000000000000000..49b74f78aa1dba47c712a16610ce502997f5f85c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/306.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/307.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/307.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8275d97980259c0d67b084dbd03465a702190b95 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/307.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/308.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/308.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ec8d74f8c0aa902bb930c7df267cc6cc09b2733 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/308.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/309.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/309.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c37a948250237aaa10de807209b4f08621ab673e Binary 
files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/309.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/31.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/31.manifest new file mode 100644 index 0000000000000000000000000000000000000000..da948f832d68a5476d941c27cb41aa7a20d9211b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/31.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/310.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/310.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5460bfbc858af9735df513c9ad79579bf83218aa Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/310.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/311.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/311.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ab9e5873a57ca14d146dfd5391e0b2a6c1c968d5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/311.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/312.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/312.manifest new file mode 100644 index 0000000000000000000000000000000000000000..351c3e42453109e12ea530f9005cb273b77718c4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/312.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/313.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/313.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c27aceb193bb2edbb98704daaf66c8001d5bf09e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/313.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/314.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/314.manifest new file mode 100644 index 0000000000000000000000000000000000000000..39105b402a3354fc62b00645185559335d04f6cc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/314.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/315.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/315.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b94221dc56baa5a08a11d5faa20d8124d0292421 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/315.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/316.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/316.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a0c4da0367120c1425b9dd5bbe533863816e760 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/316.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/317.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/317.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7a3465b60a66409c41cece171d9384d21c0e6fc4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/317.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/318.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/318.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2b8e9a8152ee8f7d77a3aced45c518df9f8a55f0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/318.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/319.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/319.manifest new file mode 100644 
index 0000000000000000000000000000000000000000..c182bc242a1b64cc9cf29f8cb6c7906035effe64 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/319.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/32.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/32.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8df9125f862353f66d6a8a1387cedbc96d11f71e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/32.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/320.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/320.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2967fcbd475ed0fc3dbf1b5e2d2d90dbe31de4b1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/320.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/321.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/321.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0bd419af5751d8465e42b0cd2cfadd12c1da94f5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/321.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/322.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/322.manifest new file mode 100644 index 0000000000000000000000000000000000000000..58e467f6c66692bb530a50a911658a9f82c08f71 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/322.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/323.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/323.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d4222dbc0b0a9552b6beeec7f9568181e426e01c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/323.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/324.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/324.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6263afed9e601ec661649c0e88940396fbc06fe0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/324.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/325.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/325.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7261b5827e41ad0b3c1b0125868b9a44cc124d5f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/325.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/326.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/326.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a9413ffbdd3245ab58bd6fa7f39a168247a8a0e0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/326.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/327.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/327.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b8f7debac5e9c9c8c1c74fca12c3bd544cbde9f8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/327.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/328.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/328.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ae35f9725950eed2bd4f814e26c13ef0d36db356 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/328.manifest differ diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/_versions/329.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/329.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e3f7f9167295662e44c7bf8369b4bc1063f6ddae Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/329.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/33.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/33.manifest new file mode 100644 index 0000000000000000000000000000000000000000..33ad614dfb86a2136a7401cbdb6b3ed31b87180d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/33.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/330.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/330.manifest new file mode 100644 index 0000000000000000000000000000000000000000..00e804d7cde10f4e5e96e7a1d0b1b1ff8a3b19b5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/330.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/331.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/331.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b7a8e020926311efbe8fb7736c9246baa077663c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/331.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/332.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/332.manifest new file mode 100644 index 0000000000000000000000000000000000000000..560cb7d1adadb4f8ea39791ae141df355c6b9ac5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/332.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/333.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/333.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a73a396192a97d72f47f64191e5520cbc79c3170 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/333.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/334.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/334.manifest new file mode 100644 index 0000000000000000000000000000000000000000..09ba8aa13903e4072cdf38627f23f8af21cf9f52 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/334.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/335.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/335.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e293c48e7e9e01b558ebbfb5323d0ae964ba8ed1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/335.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/336.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/336.manifest new file mode 100644 index 0000000000000000000000000000000000000000..92f68977e04827b81cd2523f8565780d467d6e84 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/336.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/337.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/337.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5ba6ce3eb1d56211b126d90e2af8bfbd2e39ca39 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/337.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/338.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/338.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8d66b7a714e360bfbec7d404447f1188ea00e78a Binary 
files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/338.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/339.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/339.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c78d32b80716d1f6a73a0ec1d82b4a0c50b84511 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/339.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/34.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/34.manifest new file mode 100644 index 0000000000000000000000000000000000000000..24272accf8dc17d3ac5975d4050bb948c50d8283 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/34.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/340.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/340.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7c19ac03aee6bcd58624431875f33b0dc011d61a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/340.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/341.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/341.manifest new file mode 100644 index 0000000000000000000000000000000000000000..80807e0aaa5fcf6856ad4d1c2448cf014d11d406 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/341.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/342.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/342.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4bcfe4e5726400c99fbf21920d73927eb056843b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/342.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/343.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/343.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ccf00130138da2466628f6f56f0c74d8da68d7c7 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/343.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/344.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/344.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ef831255cd36c992e17b3aaf6c9de4eb5273c4ed Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/344.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/345.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/345.manifest new file mode 100644 index 0000000000000000000000000000000000000000..16c298d6220b6ea6c10d96470e71b389f96d446d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/345.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/346.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/346.manifest new file mode 100644 index 0000000000000000000000000000000000000000..cad3fddd6bac25c645e50f9d7b3ba2420de9fafe Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/346.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/347.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/347.manifest new file mode 100644 index 0000000000000000000000000000000000000000..937a4e60c41f030bb66a31c53db8f6ecc44bd0eb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/347.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/348.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/348.manifest new file mode 100644 
index 0000000000000000000000000000000000000000..497878b899c8c2232fa35939c0e723ceb972ad24 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/348.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/349.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/349.manifest new file mode 100644 index 0000000000000000000000000000000000000000..05f7657333588b67ea3a0f9e7bb7347b14c50909 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/349.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/35.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/35.manifest new file mode 100644 index 0000000000000000000000000000000000000000..537871b12224d18254e3e1307def74e821550a34 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/35.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/350.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/350.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1e0b3777f42c812bbfdeea6f9ed945c3054fe0ea Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/350.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/351.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/351.manifest new file mode 100644 index 0000000000000000000000000000000000000000..46894430b01f7de649da111bcd38f548b8ec578e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/351.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/352.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/352.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e84298785fe916a8d9046c21263f2c3e32a9fe75 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/352.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/353.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/353.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c5a781bde9bbef4cbfc464fe00318deeac237278 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/353.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/354.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/354.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5dbc9ad2dbfa4c620da07eab97bad5ded3203cf4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/354.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/355.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/355.manifest new file mode 100644 index 0000000000000000000000000000000000000000..df8a7db91145e5555b032bd1846e4a1be68632a5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/355.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/356.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/356.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ccf47dabd5986df3412d8394fd060d045fef9562 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/356.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/357.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/357.manifest new file mode 100644 index 0000000000000000000000000000000000000000..35033a5737ffb0faebc66538a63c63428556130f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/357.manifest differ diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/_versions/358.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/358.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c0280958f00e35dffdcce9908031d74124530ca0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/358.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/359.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/359.manifest new file mode 100644 index 0000000000000000000000000000000000000000..670dc82b18d993e1f612e01ff822e24d9018503d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/359.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/36.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/36.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eab00d2dc8d5f9d89fca333073011bb321fe5ac1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/36.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/360.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/360.manifest new file mode 100644 index 0000000000000000000000000000000000000000..88d147d0f9ddaf88b6fdf53cd18875bea4bcde7e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/360.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/361.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/361.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1d40faf5b881703d91870c54ab9035fb44035797 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/361.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/362.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/362.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9687d3e4a1e45988ee4462f9cd2f213f6d524c5b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/362.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/363.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/363.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d76479e19daeeaee6b0ba4d5d08c221c3235a76e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/363.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/364.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/364.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d0bce17b2be19cedd1cf4822108154246a62c023 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/364.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/365.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/365.manifest new file mode 100644 index 0000000000000000000000000000000000000000..06f7a46418fe80452de88d62b288994bcc1a5a55 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/365.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/366.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/366.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9792fc3090753898b8bd2dfa395a9ffe8ea67f94 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/366.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/367.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/367.manifest new file mode 100644 index 0000000000000000000000000000000000000000..add314451ecc331a2251e7dea1c2ff0cfd0af8c6 Binary 
files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/367.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/368.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/368.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1c912f422a31843a13dd7b6313868a3c0e0c0b29 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/368.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/369.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/369.manifest new file mode 100644 index 0000000000000000000000000000000000000000..afd35573aca17982dc7d05a83cd4329eace556f6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/369.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/37.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/37.manifest new file mode 100644 index 0000000000000000000000000000000000000000..778536bda6b582471f8c715c094f8a601351100b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/37.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/370.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/370.manifest new file mode 100644 index 0000000000000000000000000000000000000000..09c1c95635df5c2115a7287d15988ef1dfc1055e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/370.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/371.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/371.manifest new file mode 100644 index 0000000000000000000000000000000000000000..756d683e7a1f890f9200962e7b8adfd46fa4a0d8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/371.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/372.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/372.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bbb559b7e5fef8548eb63106c48626a14a1f583d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/372.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/373.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/373.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4f5a3def329bf3fa023c0fd0a3deb0d1def6bdd8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/373.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/374.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/374.manifest new file mode 100644 index 0000000000000000000000000000000000000000..634c79545ea7e01936e04b57d6c998d9123b258c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/374.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/375.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/375.manifest new file mode 100644 index 0000000000000000000000000000000000000000..17ce1ef08a44b3b6d3d6146ee0a3aa51ea1f87d6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/375.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/376.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/376.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb988856dccfbba59a16e260ca36107b1b9907f5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/376.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/377.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/377.manifest new file mode 100644 
index 0000000000000000000000000000000000000000..326c6446f5c3c3632ef1174220f8272d182910eb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/377.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/378.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/378.manifest new file mode 100644 index 0000000000000000000000000000000000000000..50c1810f5ce8e6775f82baf46ce40dab593e1c1d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/378.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/379.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/379.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4b965f635d05bdb4da6a83bf944dac73a4642942 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/379.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/38.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/38.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a475254d8fb088a8ab4beda63bd59123407dfa46 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/38.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/380.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/380.manifest new file mode 100644 index 0000000000000000000000000000000000000000..83900e1e4a65c876d9e63d335b40ac7f8c56943b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/380.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/381.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/381.manifest new file mode 100644 index 0000000000000000000000000000000000000000..56f35212ed261b10f72351df4a9224fa93f97ce0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/381.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/382.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/382.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d32f1624cd3a0317f358758c1c2a63dfbf5e1551 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/382.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/383.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/383.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6a6b50e636233af7691bd58ca50180f25ebbd1dd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/383.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/384.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/384.manifest new file mode 100644 index 0000000000000000000000000000000000000000..850ff536d488a8ff1dbc2db49fcff292b0f2e5ef Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/384.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/385.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/385.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ee712e240141af0a3381ee23994fd70781dd4f29 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/385.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/386.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/386.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6d56f327ffc783ff02a96fdead0493a701f24515 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/386.manifest differ diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/_versions/387.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/387.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8734c0789531f408ff650f496056d4ca563aca7f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/387.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/388.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/388.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c4bb232d56d50ba09bf24d238d1883e095fec997 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/388.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/389.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/389.manifest new file mode 100644 index 0000000000000000000000000000000000000000..46a9827faa2ee380ee4f463008c14f8e66f60bdd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/389.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/39.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/39.manifest new file mode 100644 index 0000000000000000000000000000000000000000..6eb2d3b294e901dd096e9997c47c3dc6c83cfb51 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/39.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/390.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/390.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd3fa8222d45ce0595567b19cd17215683711d1d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/390.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/391.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/391.manifest new file mode 100644 index 0000000000000000000000000000000000000000..56b03a72b7238611a7b59c2898e103b3c9dcda51 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/391.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/392.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/392.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9f9e02c95b0b30905039993d0d6a5f296947f3ba Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/392.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/393.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/393.manifest new file mode 100644 index 0000000000000000000000000000000000000000..014af45333c98fb68558dc417f3f3e34c71c47a0 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/393.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/394.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/394.manifest new file mode 100644 index 0000000000000000000000000000000000000000..eb3e1a63427a23aae150ef0e09808850e0c100dd Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/394.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/395.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/395.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2a4e5ebd77b47a336ee2214c2d3daefb03aee059 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/395.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/4.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/4.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7b6703c8425f10853af072cb4fed3883d1f03283 Binary files 
/dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/4.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/40.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/40.manifest new file mode 100644 index 0000000000000000000000000000000000000000..23480deb564a2bf9fcb612cd2175242a99db4f76 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/40.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/41.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/41.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4b1248b0277a52132cc500f34383f017b13c1c6e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/41.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/42.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/42.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e6b0cee222baba4b38409a06e6d6bffcfbbf41a4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/42.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/43.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/43.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ed04be585afabb488d763ee37b46dbc76a828d5e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/43.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/44.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/44.manifest new file mode 100644 index 0000000000000000000000000000000000000000..bac3e1937656f21524c3864ee69859942b7a7fdc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/44.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/45.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/45.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e9f7d55796aa22e0b4a351b6ef427f202853fc4f Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/45.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/46.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/46.manifest new file mode 100644 index 0000000000000000000000000000000000000000..2d5648383c1958849018ac3713dae4fe0e1ebc86 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/46.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/47.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/47.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9256fe39d832b6771c0ee8ac1e9520b56a47e322 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/47.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/48.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/48.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8669fa261d8b33a9035e6a216728daae10ee14fc Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/48.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/49.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/49.manifest new file mode 100644 index 0000000000000000000000000000000000000000..472554b534a7caee652ad72ccb5a46f148be7e3a Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/49.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/5.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/5.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..c81f68a502ce2a9a864a79c2fabb68cb6f7e791b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/5.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/50.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/50.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d0160a273259d0aa030675f64c646f78a7cd3474 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/50.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/51.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/51.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3f956604fbb2af56fd576e58ed35aa26255defd1 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/51.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/52.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/52.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3b862add9c1e515ee2c8e009f5cf65922f9a5501 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/52.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/53.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/53.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ecc677590c84d3436c8ff474f55db48e35d051c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/53.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/54.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/54.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9aab58451872c36cd4b15062985637f06ede435d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/54.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/55.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/55.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c5cce1e9a0cc6dcc5302d126bdff6356aa2c2524 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/55.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/56.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/56.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8cb0f46a4d933728ed6d711e049e5b54da5a8070 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/56.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/57.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/57.manifest new file mode 100644 index 0000000000000000000000000000000000000000..857ecf13d67352a38bef2fbf828222a4cbb45fd4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/57.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/58.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/58.manifest new file mode 100644 index 0000000000000000000000000000000000000000..abc19cc1f3ea5adf9527d90755ea0e2a8402049d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/58.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/59.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/59.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8207fff2eb71532dd55f1b705556f8607abd1f13 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/59.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/6.manifest 
b/.lancedb/nltk_chunking_BAAI.lance/_versions/6.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c8a06bc328373c60124f97b844cb981f5995f439 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/6.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/60.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/60.manifest new file mode 100644 index 0000000000000000000000000000000000000000..55dca229709b387a25c0ead718c5323016abc8d8 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/60.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/61.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/61.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e5b7a6ecd2916e83f3a1757ba9cbb4f807c7b878 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/61.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/62.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/62.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f88d8ff82afd1a0e2af271dcd7b5d7477f0e7e0d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/62.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/63.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/63.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3edfa0a81e8d299e005bfd9f4dce6c85a8c7b8b6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/63.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/64.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/64.manifest new file mode 100644 index 0000000000000000000000000000000000000000..9ab90ba1d15ef9a28a9833c8fc2e6aab766d7608 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/64.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/65.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/65.manifest new file mode 100644 index 0000000000000000000000000000000000000000..595538aa418f47f09733870d37c9537ef7539394 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/65.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/66.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/66.manifest new file mode 100644 index 0000000000000000000000000000000000000000..fd44df268e4d87c1c7d0dc029ee194a5999415f5 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/66.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/67.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/67.manifest new file mode 100644 index 0000000000000000000000000000000000000000..146338ecc1058ad1a723765c59ce3db9c7397eff Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/67.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/68.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/68.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ac0ded39fcc3f8f0d8a09966b868e98eeddd7798 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/68.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/69.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/69.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d29ee6ab2a4f0516a0c3524516d42f716677683b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/69.manifest differ diff 
--git a/.lancedb/nltk_chunking_BAAI.lance/_versions/7.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/7.manifest new file mode 100644 index 0000000000000000000000000000000000000000..686351de0e598623581b7fa5d797b29b4202baec Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/7.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/70.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/70.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b6f6b6e8377f2b01bbb74e8fb06f4a92f2db3fc3 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/70.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/71.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/71.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1ebb3374a3bd4662a48258befffd60d4501c5e69 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/71.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/72.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/72.manifest new file mode 100644 index 0000000000000000000000000000000000000000..dddf793ab4f8eb1062d7e83ac9c08bbfbd0da5ec Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/72.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/73.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/73.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a294c5853a69716d8be1972969703096d06e26be Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/73.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/74.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/74.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a9f0ad454565a1c572f4a292543a64abade9c05b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/74.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/75.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/75.manifest new file mode 100644 index 0000000000000000000000000000000000000000..c88e46bbc944dce67aa6aaa5947659aa4a826144 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/75.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/76.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/76.manifest new file mode 100644 index 0000000000000000000000000000000000000000..8376ebc26dff243ad4b8f37ffd101773ecf63590 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/76.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/77.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/77.manifest new file mode 100644 index 0000000000000000000000000000000000000000..11ea7b58dd87c707eeeecf588bd6a33b00b6f376 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/77.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/78.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/78.manifest new file mode 100644 index 0000000000000000000000000000000000000000..7081a08ea154bf5480c94a2cedd3cd8a46b6e15b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/78.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/79.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/79.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a1b5803a0181575ac5b3293ae46a7830c79b5933 Binary files /dev/null and 
b/.lancedb/nltk_chunking_BAAI.lance/_versions/79.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/8.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/8.manifest new file mode 100644 index 0000000000000000000000000000000000000000..335e43ff21e81ee99378320f3feb9a3306ff3eeb Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/8.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/80.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/80.manifest new file mode 100644 index 0000000000000000000000000000000000000000..29379ef43435672b6941dff0e1f6a3ef8a86032b Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/80.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/81.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/81.manifest new file mode 100644 index 0000000000000000000000000000000000000000..e2ac37014c038107736562714c09ef336ee5323e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/81.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/82.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/82.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d6c53d06508da6603f0e1c6fbdaac4d75049a545 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/82.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/83.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/83.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ed037a94ab82b793953f6f4fa77b3aca00eb4fba Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/83.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/84.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/84.manifest new file mode 100644 index 0000000000000000000000000000000000000000..28f3e3008ce27548bcbf6c48808f4d6e40ab662d Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/84.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/85.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/85.manifest new file mode 100644 index 0000000000000000000000000000000000000000..47c4883e5dc4e416472de086312859cf87d3de79 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/85.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/86.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/86.manifest new file mode 100644 index 0000000000000000000000000000000000000000..f8885085a7f25d4b1c7128e5e0dd4fddb6a012f4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/86.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/87.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/87.manifest new file mode 100644 index 0000000000000000000000000000000000000000..193edae6b82b2800d4b427a4a8fa22d7583a1bec Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/87.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/88.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/88.manifest new file mode 100644 index 0000000000000000000000000000000000000000..5cf7f717778244f59dfa461310d921fa9a78c579 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/88.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/89.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/89.manifest new file mode 100644 index 
0000000000000000000000000000000000000000..934567347f9ed7824df71fd4b1f5c1edfbaf3c85 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/89.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/9.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/9.manifest new file mode 100644 index 0000000000000000000000000000000000000000..3fe7eaa734c9497b9834d82651228e446c6027ca Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/9.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/90.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/90.manifest new file mode 100644 index 0000000000000000000000000000000000000000..4929afc3ea6806bc20b0f05ba8b64cdcdde96a04 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/90.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/91.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/91.manifest new file mode 100644 index 0000000000000000000000000000000000000000..446daa205c37229bd919b7b3da6ce01f137b40d9 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/91.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/92.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/92.manifest new file mode 100644 index 0000000000000000000000000000000000000000..a87d03ddef48b038769fb1fa28dcac460efda13c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/92.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/93.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/93.manifest new file mode 100644 index 0000000000000000000000000000000000000000..b70e0f96a9410e9c87fceb5eeee5457cadd8b9a6 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/93.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/94.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/94.manifest new file mode 100644 index 0000000000000000000000000000000000000000..ebb7943236a38485e57a79932f4996eded3c945e Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/94.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/95.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/95.manifest new file mode 100644 index 0000000000000000000000000000000000000000..d231da63e09ece4c918c605f04437b71d2cfbe5c Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/95.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/96.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/96.manifest new file mode 100644 index 0000000000000000000000000000000000000000..0b098cf30c3b0c9b2e63c4628015b8fc90e91f37 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/96.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/97.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/97.manifest new file mode 100644 index 0000000000000000000000000000000000000000..1768521f425abe957164a2d239f01ad036d682f4 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/97.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/98.manifest b/.lancedb/nltk_chunking_BAAI.lance/_versions/98.manifest new file mode 100644 index 0000000000000000000000000000000000000000..087840f219c6ef1603cb98f78a79fa10e3497633 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/98.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/_versions/99.manifest 
b/.lancedb/nltk_chunking_BAAI.lance/_versions/99.manifest new file mode 100644 index 0000000000000000000000000000000000000000..017801befd55a6f61db3180255e23243d443d364 Binary files /dev/null and b/.lancedb/nltk_chunking_BAAI.lance/_versions/99.manifest differ diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0093be1a-38b8-43a0-9956-6a875276e9e5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0093be1a-38b8-43a0-9956-6a875276e9e5.lance new file mode 100644 index 0000000000000000000000000000000000000000..70b6ace1ac405bc7e4c93f0d03630bef73080d10 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0093be1a-38b8-43a0-9956-6a875276e9e5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be57a6058e4168b6c4eaf3a4d72c28424d4b9319eeaf7ae042eb082d230b9ebc +size 135278 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/00fa3dea-2049-480c-b310-6b28d9cfaef1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/00fa3dea-2049-480c-b310-6b28d9cfaef1.lance new file mode 100644 index 0000000000000000000000000000000000000000..a76cb59fbc3e8ceb8e6ec573f65ca2a3d2c27c96 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/00fa3dea-2049-480c-b310-6b28d9cfaef1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:301f68e435b0da9eb36301560a3aeeec819b14cb597d885efb601486233cbd50 +size 148401 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/01f3eb3b-5734-4e2e-8477-d6905e49c3cb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/01f3eb3b-5734-4e2e-8477-d6905e49c3cb.lance new file mode 100644 index 0000000000000000000000000000000000000000..d0e97a14be12845647325935592812da9b923470 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/01f3eb3b-5734-4e2e-8477-d6905e49c3cb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2ba447564fd0001a0aedc264e1a492113f63353f26f42c607be3fffa5742b7e +size 139748 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/02046106-26d7-4d08-b739-c91b96f47aa3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02046106-26d7-4d08-b739-c91b96f47aa3.lance new file mode 100644 index 0000000000000000000000000000000000000000..fedfd04e50ace6c6520e8be5ab8afe10a394a36f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02046106-26d7-4d08-b739-c91b96f47aa3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c473411cf6d9ff6a363f7cd0939d2b31fdaf2fea623faa77d01fa156474d103 +size 137337 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/021a4d6c-f9e6-4bb2-84b8-105d1fcaeff7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/021a4d6c-f9e6-4bb2-84b8-105d1fcaeff7.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3aa5736922bb4306db11debf2d2b7ae494c3cbd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/021a4d6c-f9e6-4bb2-84b8-105d1fcaeff7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a723333c7b2167cbb34d987944121c6b0b16a3de739293b1cf938f9375d88dc +size 137550 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/02ae3540-151f-43a5-b280-5d1d528ba222.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02ae3540-151f-43a5-b280-5d1d528ba222.lance new file mode 100644 index 0000000000000000000000000000000000000000..27dc9d17dbbdb8eb42c334bc7ce9197262e56bcd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02ae3540-151f-43a5-b280-5d1d528ba222.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:009c0b0105dfd1428aa2f344cad3ff5729968f32c33980361d103af92c9859de +size 137418 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/02bbade5-5dd8-4407-8010-007309e6b2a1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02bbade5-5dd8-4407-8010-007309e6b2a1.lance new file mode 100644 index 0000000000000000000000000000000000000000..a80025467ede6b4d99ca99684549fcd6d7d3c934 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02bbade5-5dd8-4407-8010-007309e6b2a1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c17c144d69782ffd0a429acdb29d80c0b842b9156b52dbe981524b429bd47210 +size 136780 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/02c6256d-f989-479b-a354-027692a73ed6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02c6256d-f989-479b-a354-027692a73ed6.lance new file mode 100644 index 0000000000000000000000000000000000000000..e5fb3992644cdce5c41bacc798d88238e84deb0c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02c6256d-f989-479b-a354-027692a73ed6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f3f7cac8f3a01582d68942e089a5673795329b691262d5e75d4013e3f6d2163 +size 136655 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/02d13c18-d03f-4529-9fd7-ee3b440c0e26.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02d13c18-d03f-4529-9fd7-ee3b440c0e26.lance new file mode 100644 index 0000000000000000000000000000000000000000..145ec2b9c4631d01f00adfb89c7ba4e3c5e1fd49 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02d13c18-d03f-4529-9fd7-ee3b440c0e26.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f27a4d8a7ad4a86e7a246b7db8231b3eb360adedb04938391209acecea1a2d1 +size 136903 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/02f1e9f4-307a-45c2-a7d9-711784f0bc3b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/02f1e9f4-307a-45c2-a7d9-711784f0bc3b.lance new file mode 100644 index 0000000000000000000000000000000000000000..58ab37461cfb057f311308ac82a029c79d83644c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/02f1e9f4-307a-45c2-a7d9-711784f0bc3b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b72937227309e5a88a36226172a4c9de9ba3daa6ebf79a88f924a3fd228d54f1 +size 139711 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/03180ac4-5e6e-4570-a4b4-5db83b56a5cb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/03180ac4-5e6e-4570-a4b4-5db83b56a5cb.lance new file mode 100644 index 0000000000000000000000000000000000000000..d4e24c42db5c38af2c206ca5875622886eaea77d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/03180ac4-5e6e-4570-a4b4-5db83b56a5cb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcce4842561376a01f65d89f20d69081926e138a1c7c48d581774549dc739540 +size 137316 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/037c38cf-9f8a-4f49-939d-4858400d1af7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/037c38cf-9f8a-4f49-939d-4858400d1af7.lance new file mode 100644 index 0000000000000000000000000000000000000000..d37021b1034d37063cc0b8e2cc1e7948b1c04d25 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/037c38cf-9f8a-4f49-939d-4858400d1af7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ad7affe3571348efcf4479ceb7fe5172f4d96df26d05e6163b71fd6946dd8a1 +size 135700 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0601c18a-151d-47d0-82eb-aee9a6ac4b3b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0601c18a-151d-47d0-82eb-aee9a6ac4b3b.lance new file mode 100644 index 0000000000000000000000000000000000000000..eb8155a06061340a2eddcc0e7b1f68f7daded57c --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/0601c18a-151d-47d0-82eb-aee9a6ac4b3b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22c8e4b8728e2084bdc394b51df8a662971cd91cb6f3e8f90174723345b05b14 +size 136707 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0614ae65-b908-4c1a-88ef-ebebd7cbfac2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0614ae65-b908-4c1a-88ef-ebebd7cbfac2.lance new file mode 100644 index 0000000000000000000000000000000000000000..521f9534e393899e98804ebf5e57b2cd9146a305 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0614ae65-b908-4c1a-88ef-ebebd7cbfac2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94839b0aabd63f3077f61df1a18a930654ec6b189cec174b3ba145f579fa2b7f +size 135632 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/07b96f51-6e8b-4730-aee4-f59f1b688375.lance b/.lancedb/nltk_chunking_BAAI.lance/data/07b96f51-6e8b-4730-aee4-f59f1b688375.lance new file mode 100644 index 0000000000000000000000000000000000000000..6648618d55a71d07776f3cab182ae3162ce4df58 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/07b96f51-6e8b-4730-aee4-f59f1b688375.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cca82cd20f139d603c1c04733c478cb663d5b48a9fcd72f20984c34d431dedf +size 137685 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/097eeda7-baea-4ead-9e3c-24c8d16960cd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/097eeda7-baea-4ead-9e3c-24c8d16960cd.lance new file mode 100644 index 0000000000000000000000000000000000000000..226dcabc3f5bcd9dfee0041d1967ea983d4f9d45 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/097eeda7-baea-4ead-9e3c-24c8d16960cd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ebe9a06ce4935429c8d5905ee9dbcdd740de20e342c158aa6016232e8ab81f2 +size 136182 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0ab5b7b8-59c3-4614-91c4-f4373fd41879.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0ab5b7b8-59c3-4614-91c4-f4373fd41879.lance new file mode 100644 index 0000000000000000000000000000000000000000..218b2b78ee5c77a4beeda9c1c66e5412f2ea1dc7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0ab5b7b8-59c3-4614-91c4-f4373fd41879.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1c844f31018d41701c5779b9d32479c0ef46faba7f604b12ec4b9b5740f28cb +size 136319 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0c01e13d-fbae-4fdd-a8a4-8d6d7df5d500.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0c01e13d-fbae-4fdd-a8a4-8d6d7df5d500.lance new file mode 100644 index 0000000000000000000000000000000000000000..4d62b006424b9da475c9a41df7acd24524d9f77f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0c01e13d-fbae-4fdd-a8a4-8d6d7df5d500.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad6fbcbf807f8dbe0e64e94eb01accc569c34d25635f9945461ee31884a11f67 +size 136691 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0c1070b0-450d-4d4e-921f-70a5ade39fb7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0c1070b0-450d-4d4e-921f-70a5ade39fb7.lance new file mode 100644 index 0000000000000000000000000000000000000000..42bf9da3b0aba006b9d169738346abc114c75445 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0c1070b0-450d-4d4e-921f-70a5ade39fb7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efd4ca105ebb2674b0ac1125a7804e6db49585217714a23eeda5bd1f69177bc5 +size 139762 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/0d080496-c05e-4560-ae76-5976c3e425c3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0d080496-c05e-4560-ae76-5976c3e425c3.lance new file mode 100644 index 0000000000000000000000000000000000000000..47092db82fc8c404dc7bd4aee2362bc8d3e1079b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0d080496-c05e-4560-ae76-5976c3e425c3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ccc70c0b0aeb27559da169c11c8d22690b469d7ebbe591888c5ee1d303e7d2d +size 136585 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0d2cea82-cae3-45a2-8b6e-0dfe4e089317.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0d2cea82-cae3-45a2-8b6e-0dfe4e089317.lance new file mode 100644 index 0000000000000000000000000000000000000000..c97b8ca984e920f96ef660c743eabcbfa7071639 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0d2cea82-cae3-45a2-8b6e-0dfe4e089317.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a61d4a314352c00c9228372e7412dd7c668c01d31bf26cb80a420df6f82fb18 +size 137547 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0d40367c-5932-43cf-bf69-abdbceeedfd0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0d40367c-5932-43cf-bf69-abdbceeedfd0.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a7d74456c6907260f5c9b29207de559740a74a0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0d40367c-5932-43cf-bf69-abdbceeedfd0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79162f91d6d80158d974148e0a93da74f9fa03d7e315d6d2046f82d2e019463a +size 137251 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0d9a89f2-ff5e-4fae-8a3f-160dc0e67aca.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0d9a89f2-ff5e-4fae-8a3f-160dc0e67aca.lance new file mode 100644 index 0000000000000000000000000000000000000000..8b88a736c8b89666967cc40467244fc58a418850 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0d9a89f2-ff5e-4fae-8a3f-160dc0e67aca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5c6f32712480bf15b35112277d9627b1ea675653cbf551c4d0dcc4fa73d59dc +size 136944 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0df454eb-c0ee-4eab-a51c-1975e90803e3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0df454eb-c0ee-4eab-a51c-1975e90803e3.lance new file mode 100644 index 0000000000000000000000000000000000000000..1e9c4f7a63ba43e4b4d53129a2725555aca86d94 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0df454eb-c0ee-4eab-a51c-1975e90803e3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aebc9efb7dd775968a54f42916124d7dd4bd0d5f56be60552de65a22bbfad076 +size 135909 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0eab5dfc-d5cb-4a46-8391-d0c2e12b039c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0eab5dfc-d5cb-4a46-8391-d0c2e12b039c.lance new file mode 100644 index 0000000000000000000000000000000000000000..2c8928e1b8b2974f1e190f9b3f6b58fb5fef9089 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0eab5dfc-d5cb-4a46-8391-d0c2e12b039c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c973e3acf4f6822fe4bee37d2ed5b6066cc8ab50ef23e5fa5d20a26a9148f8d5 +size 140877 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0ec32f9e-5293-497d-9736-d5a3e03bacdb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0ec32f9e-5293-497d-9736-d5a3e03bacdb.lance new file mode 100644 index 0000000000000000000000000000000000000000..65c8346df50b579510b9eee8b498af59997af59f --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/0ec32f9e-5293-497d-9736-d5a3e03bacdb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28bb1659263f1cee690f1d3a78ebea5c0557e9437838b665e86865cc8045fbca +size 137875 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/0ec8409d-428d-42ef-9052-6c4c8c49881c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/0ec8409d-428d-42ef-9052-6c4c8c49881c.lance new file mode 100644 index 0000000000000000000000000000000000000000..f01cca995d5b2f5c2987f09b9ac533519d984777 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/0ec8409d-428d-42ef-9052-6c4c8c49881c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4e5a171343c609b2cccd57fe1db0cbf9a36758e5dffdb614b30769103fb1eb7 +size 138895 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/10114d89-4dd8-4905-a98b-3075ca6fb127.lance b/.lancedb/nltk_chunking_BAAI.lance/data/10114d89-4dd8-4905-a98b-3075ca6fb127.lance new file mode 100644 index 0000000000000000000000000000000000000000..a156b264e4b40264f5660919ce012f63a2050614 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/10114d89-4dd8-4905-a98b-3075ca6fb127.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b738560e3f3812f89ef917fb22d11778bb51037a7e79abd11556f81ec003e6aa +size 138693 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/10f3dde0-a9fb-49c9-b06f-a325a67a3892.lance b/.lancedb/nltk_chunking_BAAI.lance/data/10f3dde0-a9fb-49c9-b06f-a325a67a3892.lance new file mode 100644 index 0000000000000000000000000000000000000000..03ba0a9a64f55a8f0126fd97b5fbc115bfacabb0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/10f3dde0-a9fb-49c9-b06f-a325a67a3892.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:321d1f65cc93ec53bbfc7eba804fc8a89c293a9a907f9a65277415400c2e3d3c +size 135945 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/12497743-d35b-4fb9-9c02-d4e1a1b0ea58.lance b/.lancedb/nltk_chunking_BAAI.lance/data/12497743-d35b-4fb9-9c02-d4e1a1b0ea58.lance new file mode 100644 index 0000000000000000000000000000000000000000..b5fcf70a5d007c7e45cd9f1455d0d7d81c60455c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/12497743-d35b-4fb9-9c02-d4e1a1b0ea58.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e2eef5c6aa0f9eaf9030ebe3ce43062a02a0d6c0bf0c27b75232b5dff21460 +size 138958 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/12952516-7115-4d66-b7f9-b189204c39af.lance b/.lancedb/nltk_chunking_BAAI.lance/data/12952516-7115-4d66-b7f9-b189204c39af.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e496a221e78815547f721e51ad15e0d411531c5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/12952516-7115-4d66-b7f9-b189204c39af.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad43ae63dd8fc10c7a6d27114ff404a5935958d3f76f2209abaafe56952ec25c +size 138253 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/12b583a1-8630-43a9-868d-fc7857c081a7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/12b583a1-8630-43a9-868d-fc7857c081a7.lance new file mode 100644 index 0000000000000000000000000000000000000000..be2f51de589cbc5688dc92bd0976f8712ee68e10 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/12b583a1-8630-43a9-868d-fc7857c081a7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ea1338e488adddded89eb85082069f2dd9130481ebe0336e6e9873b9fc1081c +size 136421 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/13203a38-73ae-43e5-a0b4-ffcee7fd41c0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/13203a38-73ae-43e5-a0b4-ffcee7fd41c0.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2a2b79b52e11aa153c48d1253b2d1d4880c98fb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/13203a38-73ae-43e5-a0b4-ffcee7fd41c0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fed5fdd9aa38fdd56f2eff612c7ec07bdb3db62001967a12515cd73dbd35fe27 +size 136105 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/133294a2-668c-4ee3-8f60-2f2bfc106b18.lance b/.lancedb/nltk_chunking_BAAI.lance/data/133294a2-668c-4ee3-8f60-2f2bfc106b18.lance new file mode 100644 index 0000000000000000000000000000000000000000..3c5daa800569d679bf9f1a4abf6ffcf4f66f9319 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/133294a2-668c-4ee3-8f60-2f2bfc106b18.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1382334ed7144efd8c722be563fe467fa8a14cfcc6d80eaf1f93e6e9b24bcae +size 135368 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/138c0ec5-09e6-4f20-aa72-b2b7a472c62d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/138c0ec5-09e6-4f20-aa72-b2b7a472c62d.lance new file mode 100644 index 0000000000000000000000000000000000000000..156d1c6a93dcb1da71712b5996b875b43272ec14 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/138c0ec5-09e6-4f20-aa72-b2b7a472c62d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:200d2125478647a8fab5afef4aceb3d0d5a3c6a1b5d1913379af24d6c317f188 +size 139341 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/13fdba35-9a7f-4880-8786-101d03729ae0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/13fdba35-9a7f-4880-8786-101d03729ae0.lance new file mode 100644 index 0000000000000000000000000000000000000000..66f11a8d4cf0e39913c5a3a978071fc72315ecae --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/13fdba35-9a7f-4880-8786-101d03729ae0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d446427af3304e6d0838a061a63e558af75ecb26b35ae4426acfa7f9e72d4c2 +size 135872 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/145ed1be-f154-4a34-ab4a-0d1404ac24e2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/145ed1be-f154-4a34-ab4a-0d1404ac24e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..1d9586a268a286a7ab0ae55f52d2455e27664b74 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/145ed1be-f154-4a34-ab4a-0d1404ac24e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d28769beb5d410e3c617a99dd904f094aba9f035f7d0cd4af48d6bb7806751e4 +size 134778 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/14bebe1b-2b98-40e7-8f99-6cf3155076c3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/14bebe1b-2b98-40e7-8f99-6cf3155076c3.lance new file mode 100644 index 0000000000000000000000000000000000000000..0edffabe7c866bdfe2a6e227d54b7dd82ad2a625 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/14bebe1b-2b98-40e7-8f99-6cf3155076c3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdc77acc50493b37f67e9d60e3ecd30de78686d693748e0c91e4846771efab68 +size 137802 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/14c5754e-a56c-4636-8f57-88b571082207.lance b/.lancedb/nltk_chunking_BAAI.lance/data/14c5754e-a56c-4636-8f57-88b571082207.lance new file mode 100644 index 0000000000000000000000000000000000000000..0f04080fba24623814a0b67f2913179af1e9bc81 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/14c5754e-a56c-4636-8f57-88b571082207.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c28da6b5a949adb47f3fcba058bdc3934f1595ccf00da191d945a3bc06e2519 +size 139749 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/15b774ba-dd48-460a-b12d-bd5177424401.lance b/.lancedb/nltk_chunking_BAAI.lance/data/15b774ba-dd48-460a-b12d-bd5177424401.lance new file mode 100644 index 0000000000000000000000000000000000000000..6cac5a30e0155bc35355163b3575805f2410abeb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/15b774ba-dd48-460a-b12d-bd5177424401.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d626b93b658a9a3c600f2622ea0c0764efb5fdd130656c540bfac69ba7d4656d +size 136781 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/15dec6a0-8359-4715-8b7b-a830f71a97b2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/15dec6a0-8359-4715-8b7b-a830f71a97b2.lance new file mode 100644 index 0000000000000000000000000000000000000000..877d9aead5be8ca07a22ffdb9944eebbdd2b9afd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/15dec6a0-8359-4715-8b7b-a830f71a97b2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a29a1deb8ff832e474b82e4f8617045fadb3f4a0a5ff54a4943377d684729a9f +size 136534 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/177f3f8e-6cc0-435a-9d4a-c81d8306bbd2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/177f3f8e-6cc0-435a-9d4a-c81d8306bbd2.lance new file mode 100644 index 0000000000000000000000000000000000000000..865f592c964b5c7388f5f884e9c3feb4ba5f5928 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/177f3f8e-6cc0-435a-9d4a-c81d8306bbd2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a982663cc56407440210c28915e8f5315e9c964087f793093c7c3963e98d6cc +size 135295 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/184e7cfb-a77c-4ee5-b8fd-8e5f73ddc928.lance b/.lancedb/nltk_chunking_BAAI.lance/data/184e7cfb-a77c-4ee5-b8fd-8e5f73ddc928.lance new file mode 100644 index 0000000000000000000000000000000000000000..1183547f40167f41d81b86abf7c901c1c80be170 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/184e7cfb-a77c-4ee5-b8fd-8e5f73ddc928.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b3c23ad2bdbb5dc995b2ff01d1adba9ea5584739e1751fc3a31d47d06dd9e34 +size 137792 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/19ef7fea-c525-499b-abb7-2a10ea2b0b02.lance b/.lancedb/nltk_chunking_BAAI.lance/data/19ef7fea-c525-499b-abb7-2a10ea2b0b02.lance new file mode 100644 index 0000000000000000000000000000000000000000..400e68f4c890008af4cf5927e71ac5cb0302ce9e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/19ef7fea-c525-499b-abb7-2a10ea2b0b02.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0550d3c1851a3e75e20aecaa61b70ba52b28e4290d00e09bc49d4b731e7b819f +size 135766 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/1ac80d3e-d3b9-4d13-96b5-7994200ee55a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/1ac80d3e-d3b9-4d13-96b5-7994200ee55a.lance new file mode 100644 index 0000000000000000000000000000000000000000..b6bfe94a1a9d1a223a0c5076c2116a232e32f0e4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/1ac80d3e-d3b9-4d13-96b5-7994200ee55a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e56845cf9f6dc8ffb2882a7b739bf3237ea605576ab8359754dfbb5814c7fc23 +size 138461 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/1c148fda-05e0-43d1-be2a-4fe3dfd78e85.lance b/.lancedb/nltk_chunking_BAAI.lance/data/1c148fda-05e0-43d1-be2a-4fe3dfd78e85.lance new file mode 100644 index 0000000000000000000000000000000000000000..e8dfb2f5f2fd7af5ab31a99a6252c8c37fcebd9f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/1c148fda-05e0-43d1-be2a-4fe3dfd78e85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5990925bc2ac526304f2e8e8bf5791a7b1187f27b4f0b5be1c4d8fe693fff238 +size 135610 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/1c3cf368-b720-472a-947b-3569db5741c2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/1c3cf368-b720-472a-947b-3569db5741c2.lance new file mode 100644 index 0000000000000000000000000000000000000000..01d8caef78e3e9888b4e0b9f3aa0e46e5b515660 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/1c3cf368-b720-472a-947b-3569db5741c2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af5fcf7dece5b1f8d9f57b15c7d1b15312fe4019ab6f8e002c47bc5295510d33 +size 140679 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/1d8769f6-bed8-45e7-944f-c33decdcb404.lance b/.lancedb/nltk_chunking_BAAI.lance/data/1d8769f6-bed8-45e7-944f-c33decdcb404.lance new file mode 100644 index 0000000000000000000000000000000000000000..8c3319fc801fa4801656494cd84aa46d248da432 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/1d8769f6-bed8-45e7-944f-c33decdcb404.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df0e3f3fe306fb2da5fbee7d9d5930b9da84a30900319ffeac3109d232a198e0 +size 139062 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/1fed2093-5fd2-440f-8595-ff272766bd21.lance b/.lancedb/nltk_chunking_BAAI.lance/data/1fed2093-5fd2-440f-8595-ff272766bd21.lance new file mode 100644 index 0000000000000000000000000000000000000000..36e33e7316038c030adbcdd52474782787fe6f45 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/1fed2093-5fd2-440f-8595-ff272766bd21.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bc572c89aa2bbbc74cc17f90c30a923d3007d265662f620418225dbca33f53c +size 139045 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/201b1bec-20de-4ef2-9917-f65a0c02f850.lance b/.lancedb/nltk_chunking_BAAI.lance/data/201b1bec-20de-4ef2-9917-f65a0c02f850.lance new file mode 100644 index 0000000000000000000000000000000000000000..9dbac5e42b035d8893ea7403df0be0725f9355be --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/201b1bec-20de-4ef2-9917-f65a0c02f850.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90837357b403af02ae33f3ea6b150aca46ee561e8053f17f7a0572bbc4ed06e9 +size 137776 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/203905ab-ecd2-4257-9c5c-0902591c8229.lance b/.lancedb/nltk_chunking_BAAI.lance/data/203905ab-ecd2-4257-9c5c-0902591c8229.lance new file mode 100644 index 0000000000000000000000000000000000000000..49ce0ff74e7bb1ec061fadc390627f03487c40fc --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/203905ab-ecd2-4257-9c5c-0902591c8229.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da2b7d8ffa72b1d72bded16323a4ffb8e6ca82cfd9a69c1aa6f839bc9505b522 +size 136889 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2197a501-5eda-499e-a3b6-7b707a940dbb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2197a501-5eda-499e-a3b6-7b707a940dbb.lance new file mode 100644 index 0000000000000000000000000000000000000000..391051422cda7210de847b342a0396e0107f9f92 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/2197a501-5eda-499e-a3b6-7b707a940dbb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7230b3327ee08a44d9515a6dc456dbf840aa216636e84e9d21021f4078050b8c +size 136184 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/21aae70e-da06-4300-8d97-df2712d1b656.lance b/.lancedb/nltk_chunking_BAAI.lance/data/21aae70e-da06-4300-8d97-df2712d1b656.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f299d37eeefb870cde2ce4901cb53682321b43c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/21aae70e-da06-4300-8d97-df2712d1b656.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c009ef004e2cb7f26e0fcad920fc18135bc85d9af995966b39ebd5994c3b5025 +size 138086 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/22189d0d-aad8-4905-8953-c118c353eea2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/22189d0d-aad8-4905-8953-c118c353eea2.lance new file mode 100644 index 0000000000000000000000000000000000000000..b0dbdf7e0f95769e3741a2ffeaadfcf97e60960f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/22189d0d-aad8-4905-8953-c118c353eea2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:003c0ba59dd85655d445e73e52ddc0ced0358e4e546fc160f4088aa091353779 +size 139935 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/227c77d1-7905-4fb2-a66b-2eee944de5eb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/227c77d1-7905-4fb2-a66b-2eee944de5eb.lance new file mode 100644 index 0000000000000000000000000000000000000000..b3fc47a16e955c900074d8e3e52e626ace696be7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/227c77d1-7905-4fb2-a66b-2eee944de5eb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ecf4890940e2c9c0ffac7d5c20ab9c947d1b8892e1e816f130e766ce88a5c7c +size 137503 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/227d3781-6f98-4476-95b1-096643e3d750.lance b/.lancedb/nltk_chunking_BAAI.lance/data/227d3781-6f98-4476-95b1-096643e3d750.lance new file mode 100644 index 0000000000000000000000000000000000000000..2f684b0784d15767e829cac355e141b5c96afba6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/227d3781-6f98-4476-95b1-096643e3d750.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:191092e04745007165e50f5ed6daa7ca93dc27a0ed226b3d048b48375a91d0b2 +size 137905 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/22d0a2e8-b9fb-425b-a2fb-339ce90361a3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/22d0a2e8-b9fb-425b-a2fb-339ce90361a3.lance new file mode 100644 index 0000000000000000000000000000000000000000..e693972fd9f7e5050af4433ab83580e41a7b3497 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/22d0a2e8-b9fb-425b-a2fb-339ce90361a3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08c1681a42f69ecd955ca37b5e1276ca8699ef4a60502457675df31aee994acc +size 137555 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/22dbfcd8-5f37-4765-bc76-de5ace7aab0c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/22dbfcd8-5f37-4765-bc76-de5ace7aab0c.lance new file mode 100644 index 0000000000000000000000000000000000000000..ffee96a9ae849bf7b3326789deabaa9e7bcbd1cb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/22dbfcd8-5f37-4765-bc76-de5ace7aab0c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:197d3c9d4a25efa5a7cb24485beaee8c189ef5c288bcc7a7d5f2144960632f3d +size 138703 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/23c8c665-6e1b-4625-83e1-8e2e1187a703.lance b/.lancedb/nltk_chunking_BAAI.lance/data/23c8c665-6e1b-4625-83e1-8e2e1187a703.lance new file mode 100644 index 0000000000000000000000000000000000000000..d99b83553ad96df3c18ede0c7f56d25d51ffed39 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/23c8c665-6e1b-4625-83e1-8e2e1187a703.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42cdbda666bf1e37cdfc70962b9cd52ce56348656091a3afc68191d7fd94ede4 +size 143749 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/26488a9d-da0c-422f-8ffa-62a18a791f6d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/26488a9d-da0c-422f-8ffa-62a18a791f6d.lance new file mode 100644 index 0000000000000000000000000000000000000000..4f95feacae615929bc0ad570cb1a3e3d837e7a12 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/26488a9d-da0c-422f-8ffa-62a18a791f6d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a570e290852781e1ccd32ad895fdd7db97006002ff15cf0c75052b8215b6f1a +size 141297 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/27326f4c-4884-462d-90f9-584c6ac56184.lance b/.lancedb/nltk_chunking_BAAI.lance/data/27326f4c-4884-462d-90f9-584c6ac56184.lance new file mode 100644 index 0000000000000000000000000000000000000000..243f986a7bbd35e59fd61b03763eebc78b953e88 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/27326f4c-4884-462d-90f9-584c6ac56184.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39432785142b7d151a7be84d7629146cf596d06c3dc5586d8f7e0165e2cab9c8 +size 135843 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2b3b48bd-0766-4744-9d7c-ccf5425bc908.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2b3b48bd-0766-4744-9d7c-ccf5425bc908.lance new file mode 100644 index 0000000000000000000000000000000000000000..1e468645a2d5a25ccb3998aeecae0f32949994e3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2b3b48bd-0766-4744-9d7c-ccf5425bc908.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39257a923a2de6b93c8bb28e5c3c8bf5474ef57b2c6e1d915c6b155f8a3e3b9 +size 136259 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2c297333-f521-4ffa-ba45-0f3e08c697b3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2c297333-f521-4ffa-ba45-0f3e08c697b3.lance new file mode 100644 index 0000000000000000000000000000000000000000..cfe5009d091316cc2c2e0fbee4fbd7c2804402f1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2c297333-f521-4ffa-ba45-0f3e08c697b3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcd82d801584d3db4b395c61f4cd51370df266429e1dfde703b830143ca0d053 +size 140638 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2c6b694b-021d-4a8e-bfe1-cd4ba009536b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2c6b694b-021d-4a8e-bfe1-cd4ba009536b.lance new file mode 100644 index 0000000000000000000000000000000000000000..65a3f1c6cec208233a594a75b86b508875787874 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2c6b694b-021d-4a8e-bfe1-cd4ba009536b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd40440ddc82cc80e7414ae328ad3132d4e4debbe4994e4eacabd29ff370ac62 +size 137472 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2c79dd34-19c9-4e29-ac64-858526728d5a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2c79dd34-19c9-4e29-ac64-858526728d5a.lance new file mode 100644 index 0000000000000000000000000000000000000000..35f5d933af535d7d9023d29406b025b970fc0b8f --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/2c79dd34-19c9-4e29-ac64-858526728d5a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f4a99098b79eecf9c8c075e42a4a9743f2e89eedf8ea6a50c0a10beea66704 +size 139794 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2d663b20-a6db-4c88-bc62-be1b6ab745c7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2d663b20-a6db-4c88-bc62-be1b6ab745c7.lance new file mode 100644 index 0000000000000000000000000000000000000000..fde47ba567392b995c08a4d012a0e98a35135ee4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2d663b20-a6db-4c88-bc62-be1b6ab745c7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028c5cb9ae99014261df2678e8b855f634741916e4fe976da0de0408ba08d7d3 +size 137143 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2dd66b99-a3ce-4faf-bf9b-ef975f74f8d8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2dd66b99-a3ce-4faf-bf9b-ef975f74f8d8.lance new file mode 100644 index 0000000000000000000000000000000000000000..8d35e133041acc85e2b8a3e1843a4e5db79564c5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2dd66b99-a3ce-4faf-bf9b-ef975f74f8d8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b35ebeca4363e70e93f3b17be7dd39a2fb258c353932d8be54358fd43b3bab87 +size 138005 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2e6abe35-43f9-4bf9-84a9-99ad3a9d508d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2e6abe35-43f9-4bf9-84a9-99ad3a9d508d.lance new file mode 100644 index 0000000000000000000000000000000000000000..b08e05fb3f441cfb69dc5ca34e1d60daaabb6d31 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2e6abe35-43f9-4bf9-84a9-99ad3a9d508d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edede02223c08fef83affbdf0e56782fb6dabde365f48d0916d08b2e16cef553 +size 140034 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/2f3578a2-318a-42a1-8fc1-2458813b74ea.lance b/.lancedb/nltk_chunking_BAAI.lance/data/2f3578a2-318a-42a1-8fc1-2458813b74ea.lance new file mode 100644 index 0000000000000000000000000000000000000000..0a2d88511b1828af157e6c0370437d006083127c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/2f3578a2-318a-42a1-8fc1-2458813b74ea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549e90a5235dbabbd791972c84799378e1af13c3fae76fd87d2f92796729ab72 +size 135065 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/316c9fc0-43d7-499a-b238-5b9d796b56ae.lance b/.lancedb/nltk_chunking_BAAI.lance/data/316c9fc0-43d7-499a-b238-5b9d796b56ae.lance new file mode 100644 index 0000000000000000000000000000000000000000..395dd1f616b6f18c8c363aa173784de1d47e345e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/316c9fc0-43d7-499a-b238-5b9d796b56ae.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f607d4dbba6ae101b7762e5c28dcb77d15fbf4eddb40f513022e3544d41fb796 +size 135442 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/323a815e-29eb-4bf9-b6f1-2b624dd24e8e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/323a815e-29eb-4bf9-b6f1-2b624dd24e8e.lance new file mode 100644 index 0000000000000000000000000000000000000000..164c3e0b876f636826abf0c31aae5b5a41f21e9b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/323a815e-29eb-4bf9-b6f1-2b624dd24e8e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5cb54089d2baba23c73b341c776bc19aaa4a66991fe53ef77f121ecd89670b6 +size 136917 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/3243619a-fe6f-4227-8403-7a605f2e1357.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3243619a-fe6f-4227-8403-7a605f2e1357.lance new file mode 100644 index 0000000000000000000000000000000000000000..ab67be0488bcd9c4487f36b47291814ae433f1d5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3243619a-fe6f-4227-8403-7a605f2e1357.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:341d614f8503e8f1960f0332c62607eb1b435f7d508592d21fb8c4e82860efa9 +size 136157 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/330da0e4-e2e0-4ad3-8d0b-4fc2a7c9f553.lance b/.lancedb/nltk_chunking_BAAI.lance/data/330da0e4-e2e0-4ad3-8d0b-4fc2a7c9f553.lance new file mode 100644 index 0000000000000000000000000000000000000000..fa4b7a41f467b3e75205d7e1de7d3e94c8e6a101 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/330da0e4-e2e0-4ad3-8d0b-4fc2a7c9f553.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379aba4eea368bdd615621f70937da0cd14c57dc4477be1a908c4063f8ad94f2 +size 141024 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/332314cd-2823-4f3c-acd4-f59ea70e1c4b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/332314cd-2823-4f3c-acd4-f59ea70e1c4b.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a9df9e006bfedf023e83ae29a5a21a5989ac1ca --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/332314cd-2823-4f3c-acd4-f59ea70e1c4b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a24d51723ccc694b99eaca07c7b8c216506108a3a5fb8c44b9db41d876a04194 +size 140826 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3343e05d-a56a-40bc-bfa4-52f94397238e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3343e05d-a56a-40bc-bfa4-52f94397238e.lance new file mode 100644 index 0000000000000000000000000000000000000000..751a24293493267c0e2a602ad81f0edccfac17df --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3343e05d-a56a-40bc-bfa4-52f94397238e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8542ff4858f15970733e72a193165b74fd2c4f25649020084a62e014b9e9d711 +size 138695 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/33d1bf17-d8db-4018-aa6a-7f909fe55a77.lance b/.lancedb/nltk_chunking_BAAI.lance/data/33d1bf17-d8db-4018-aa6a-7f909fe55a77.lance new file mode 100644 index 0000000000000000000000000000000000000000..afed8a0baaf6e45a46cee6154c47c42fdd0fa198 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/33d1bf17-d8db-4018-aa6a-7f909fe55a77.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd999355c6489e76c141406a7973aabf2bed55679636ef6ddca6abb6d12eaff0 +size 136371 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/33ef102b-bf9e-40f1-86f2-e2ebf6f1515f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/33ef102b-bf9e-40f1-86f2-e2ebf6f1515f.lance new file mode 100644 index 0000000000000000000000000000000000000000..6a9e0b9f19be9cd3315bef5f9a391b416e08a744 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/33ef102b-bf9e-40f1-86f2-e2ebf6f1515f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4acff189bd16a588bc767fcef6d44d04738ce0d407f58b5c7a6ca5550f90b3e3 +size 139057 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/34b7e1af-c748-465a-84fe-2046e378db7f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/34b7e1af-c748-465a-84fe-2046e378db7f.lance new file mode 100644 index 0000000000000000000000000000000000000000..7b1970f40f9a437352f1046a3c8e42f17124e2ed --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/34b7e1af-c748-465a-84fe-2046e378db7f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e67c4bfed8ea8413bd2cce28a6a5696202112eddbfcc32c710bd496b8f4d23 +size 136577 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/36903631-b5a0-4979-a4a1-76ea6f660a36.lance b/.lancedb/nltk_chunking_BAAI.lance/data/36903631-b5a0-4979-a4a1-76ea6f660a36.lance new file mode 100644 index 0000000000000000000000000000000000000000..816807b0e1d3339aaee216be61a2ff16ec91d53c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/36903631-b5a0-4979-a4a1-76ea6f660a36.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd368dd20d1d4cde91ee033b0ec77dbcf9d7085d9060595cee99d8ec9dcf4b2f +size 135149 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/38d22ed9-5b2c-4d35-9a19-a00e43021e6f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/38d22ed9-5b2c-4d35-9a19-a00e43021e6f.lance new file mode 100644 index 0000000000000000000000000000000000000000..e88a776bc8b6fe41dd51415b8a7e728cd49195b9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/38d22ed9-5b2c-4d35-9a19-a00e43021e6f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74d6708b88b29f9a8e386510169fd539cec86094872fd9c2eedd319ef9c8f612 +size 136283 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3971538e-adca-4ff5-9670-0927fce84df9.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3971538e-adca-4ff5-9670-0927fce84df9.lance new file mode 100644 index 0000000000000000000000000000000000000000..d4305fe3a9a817e9fb9ae3b95915830bda95523e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3971538e-adca-4ff5-9670-0927fce84df9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c3d15479c15616b9f21e9f8e4e2b751a57658fdbcff38dbb85aa1977939bc47 +size 139603 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/39fe2819-47c5-4a71-b390-e79bd08708a4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/39fe2819-47c5-4a71-b390-e79bd08708a4.lance new file mode 100644 index 0000000000000000000000000000000000000000..49194c5034ff34cbc4d8b411b07a47e7064789ce --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/39fe2819-47c5-4a71-b390-e79bd08708a4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d53f083726980fd3a815bd0d1474221e33307c428e30ffeca7a6f6bcde9ded +size 138324 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3a4ccbe4-2eca-4c77-b7fa-f715f74bc9f2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3a4ccbe4-2eca-4c77-b7fa-f715f74bc9f2.lance new file mode 100644 index 0000000000000000000000000000000000000000..b018dfa9c09bdfd7cdd144380bde6859ce85b575 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3a4ccbe4-2eca-4c77-b7fa-f715f74bc9f2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2703214c9f5e6119fe1ab6d6491d9d6d4051b9e75bae8a44006f465ae6f56e71 +size 137140 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3b19bfba-e638-4c33-89b6-9ec4b709579b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3b19bfba-e638-4c33-89b6-9ec4b709579b.lance new file mode 100644 index 0000000000000000000000000000000000000000..d3bbe92a51fc97882120332bf6aa1a7b460ebcae --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3b19bfba-e638-4c33-89b6-9ec4b709579b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:431cf25bb550b6b231ee527f07713c2fb4f17fbcdb11f40b2a5a5a092e548705 +size 136507 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/3b2fa18d-f1ad-486d-aec8-15b15679c0f4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3b2fa18d-f1ad-486d-aec8-15b15679c0f4.lance new file mode 100644 index 0000000000000000000000000000000000000000..1b3ff84a87a2a801fc0268ff44dc794c0252f599 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3b2fa18d-f1ad-486d-aec8-15b15679c0f4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78d16dacfe930117b265df159e50742a983c0da23393a198ff3c34440b075329 +size 136834 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3bbe79da-b9f4-4357-b79c-7eba6a67de07.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3bbe79da-b9f4-4357-b79c-7eba6a67de07.lance new file mode 100644 index 0000000000000000000000000000000000000000..b4f608bda2f1b33b527e639fba6ced37f6162ea4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3bbe79da-b9f4-4357-b79c-7eba6a67de07.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12cc1be62345f82258a63472fbca3fe14f36d47c928389c15b0f1e6d05169aab +size 135349 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3bf9681c-e942-41a7-9ba3-d053ee47e8e1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3bf9681c-e942-41a7-9ba3-d053ee47e8e1.lance new file mode 100644 index 0000000000000000000000000000000000000000..a76b08baf1a6143613d62ccff4b66280d8b0aca3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3bf9681c-e942-41a7-9ba3-d053ee47e8e1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b41387a235fe02131d01258d511167be176be8243b485a00e632fc08762b98 +size 139690 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3cc580f5-c328-444d-9c3d-3bba2e649ba4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3cc580f5-c328-444d-9c3d-3bba2e649ba4.lance new file mode 100644 index 0000000000000000000000000000000000000000..11a4fc79a1fcc735fc42a9f3925b418e00980a0e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3cc580f5-c328-444d-9c3d-3bba2e649ba4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff68ca6e0fd13e46bff0aa03f47f2cce3c4f3f1231e8e5c38deb00ee8dfa67eb +size 136098 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3d4e5fcd-ee4f-498d-b3ee-0b84102a57ce.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3d4e5fcd-ee4f-498d-b3ee-0b84102a57ce.lance new file mode 100644 index 0000000000000000000000000000000000000000..33f548e9a9bbe217f28b6a8bf9cedd06234a0bfb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3d4e5fcd-ee4f-498d-b3ee-0b84102a57ce.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d196f846b3fffb7c98c324ae0e436dddd0748c99ce844f2215b18108f85ff4a +size 136040 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3de8dec3-40e2-4306-8315-0208eb057f18.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3de8dec3-40e2-4306-8315-0208eb057f18.lance new file mode 100644 index 0000000000000000000000000000000000000000..476b740ab636bc7df73f96fe8e9dfcdfa8e67bae --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3de8dec3-40e2-4306-8315-0208eb057f18.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:547258d4bc0a33d478a08628c7cf70366b1e6635184fa65899c036e932aa670e +size 139423 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3deac73e-7c00-4313-98aa-0ff6c1a3d415.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3deac73e-7c00-4313-98aa-0ff6c1a3d415.lance new file mode 100644 index 0000000000000000000000000000000000000000..45b110d15fe66ca286734fa3350c7f8c30bf9360 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/3deac73e-7c00-4313-98aa-0ff6c1a3d415.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef77ce95e85169aac6de4a350e1e6f2a703341c1d4be4329d1d4499b6e8aca5f +size 136156 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3e0ba04a-fcf8-406f-9bf0-770f4d54cbdc.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3e0ba04a-fcf8-406f-9bf0-770f4d54cbdc.lance new file mode 100644 index 0000000000000000000000000000000000000000..6ea2560c85ab3493e7dc4baccf6ab2508e203df2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3e0ba04a-fcf8-406f-9bf0-770f4d54cbdc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f58eddb618738700c08a93506e3755cc3f6e327fa7b9cac5fc36100068e755ec +size 136104 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3e2a4948-7db4-4202-a5f3-063e3029e550.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3e2a4948-7db4-4202-a5f3-063e3029e550.lance new file mode 100644 index 0000000000000000000000000000000000000000..051f33f4ec7fd49970b130f7d4ef9e6ade5b04ab --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3e2a4948-7db4-4202-a5f3-063e3029e550.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51924f5254be5228c106e01768acef5202e4d88f7a5067caf22f692bebbcfa5b +size 137833 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3e5d2b6c-7867-4b53-8664-418b58eaaee8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3e5d2b6c-7867-4b53-8664-418b58eaaee8.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e2d243f6cdef8a4d584aa19bb8f260752135894 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3e5d2b6c-7867-4b53-8664-418b58eaaee8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2d6a0417141e6f3d1b1a62b5ed7cdd10f8a6805e0c7ce13d83feef86410d636 +size 145764 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/3ff5f751-5bfd-44d9-a43b-2325be4dd2bd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/3ff5f751-5bfd-44d9-a43b-2325be4dd2bd.lance new file mode 100644 index 0000000000000000000000000000000000000000..ac11762c1064e76c848a48858a52c19101547bca --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/3ff5f751-5bfd-44d9-a43b-2325be4dd2bd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b441db9f475ac33fab35621a0ebfa0afe3edd6691da3e4c9a90dd41eae03ecfd +size 137753 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/40d52e36-260d-4a0b-a26a-2eaca02ce832.lance b/.lancedb/nltk_chunking_BAAI.lance/data/40d52e36-260d-4a0b-a26a-2eaca02ce832.lance new file mode 100644 index 0000000000000000000000000000000000000000..22d0318a86d23c65fd4dfdbbbb046b7b94ba258a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/40d52e36-260d-4a0b-a26a-2eaca02ce832.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47419ea133810a1ad894e1a2416bca76418b174c47ef8893a6e8bca4b8233457 +size 135740 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/416cac82-52de-49fc-a073-a108a971c5fe.lance b/.lancedb/nltk_chunking_BAAI.lance/data/416cac82-52de-49fc-a073-a108a971c5fe.lance new file mode 100644 index 0000000000000000000000000000000000000000..d856722896c302fe5bf640fd393da440fd2a5913 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/416cac82-52de-49fc-a073-a108a971c5fe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c111c8ac946d8dfd94ade23fbee4e2fb3272fc0561cc66b6b3829aca032e63ac +size 135970 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/41e7156a-9dc5-4a24-aa58-3d6ec331304f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/41e7156a-9dc5-4a24-aa58-3d6ec331304f.lance new file mode 100644 index 0000000000000000000000000000000000000000..77f005f5f6e72a592b968646583294410788c869 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/41e7156a-9dc5-4a24-aa58-3d6ec331304f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bb28cbc6bf019a23e1f847337161584909d1646c872959c2cb64f1fc693c396 +size 139482 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/42dba205-7836-4902-b338-b29fd5aa5c1b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/42dba205-7836-4902-b338-b29fd5aa5c1b.lance new file mode 100644 index 0000000000000000000000000000000000000000..1648daa52192a5359e6b9703414e7609c0d81fb1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/42dba205-7836-4902-b338-b29fd5aa5c1b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:102003503d46b4f2d474c6f261b8355d3ace330be481009bba214095aec44b9f +size 138373 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/431046a6-4af6-4804-b80c-82cdd7d741fa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/431046a6-4af6-4804-b80c-82cdd7d741fa.lance new file mode 100644 index 0000000000000000000000000000000000000000..e26a6d7ac4d6a9716db2de2c50ec622de9876f60 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/431046a6-4af6-4804-b80c-82cdd7d741fa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c3bdf47fa3fe956b7186ccb099fb9a6731973f0ce06e24b8a27bb0cb1c23bba +size 138337 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4422b33f-a973-475b-a2de-707de7630bde.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4422b33f-a973-475b-a2de-707de7630bde.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d0a661889e28170ec970d52dd59937e4595be45 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4422b33f-a973-475b-a2de-707de7630bde.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d7070ff5dc2c242a35b494b0577e5bcb1d14254478d2c4e9df0a08c61292493 +size 140497 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/44713448-9b9a-4802-b984-6d70e384df6a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/44713448-9b9a-4802-b984-6d70e384df6a.lance new file mode 100644 index 0000000000000000000000000000000000000000..b57e067ce42db60f63cec22933aebcc6156202db --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/44713448-9b9a-4802-b984-6d70e384df6a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dae4061223a501a6226c163e86385753a2a7e8a066b7b6243cf0ede76188551 +size 135950 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/456223dd-a928-42ca-a086-1f438c6312af.lance b/.lancedb/nltk_chunking_BAAI.lance/data/456223dd-a928-42ca-a086-1f438c6312af.lance new file mode 100644 index 0000000000000000000000000000000000000000..df6342b147043d1f669034abc74a41318a40a88b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/456223dd-a928-42ca-a086-1f438c6312af.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4df384dd1d85587b39a601c44edf5b6c6c7f2acbc0db14c6270af7095d6c4ba9 +size 137913 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/47d1b3ec-5fb2-4718-86af-4f43a2f9b498.lance b/.lancedb/nltk_chunking_BAAI.lance/data/47d1b3ec-5fb2-4718-86af-4f43a2f9b498.lance new file mode 100644 index 0000000000000000000000000000000000000000..db1a2fe8983a6b1bf2e858c4f5e03181300292b4 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/47d1b3ec-5fb2-4718-86af-4f43a2f9b498.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d789037a164340af53e0943b2d1d0d8c7c45cf3b126be62a98f93dfb36ee97b +size 137902 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/48e83aac-905f-49af-be8f-0d71e366922c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/48e83aac-905f-49af-be8f-0d71e366922c.lance new file mode 100644 index 0000000000000000000000000000000000000000..990933a7e8d7c30a80b3ca7fe552244ecfac03d2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/48e83aac-905f-49af-be8f-0d71e366922c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60354ef63c9e92d1a98c773cff77dbd5a8fda3430f17670c8fb063c55acda5ae +size 136998 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/49aa78ef-7941-4272-b45a-78933a3c3763.lance b/.lancedb/nltk_chunking_BAAI.lance/data/49aa78ef-7941-4272-b45a-78933a3c3763.lance new file mode 100644 index 0000000000000000000000000000000000000000..da5cd90f6ff1e466e78d68e6e2095939842f16fe --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/49aa78ef-7941-4272-b45a-78933a3c3763.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4741c2cc839229e546b197ebbadb3094e73fd233396d0f38f26da1a25f34ac1 +size 137540 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4a654336-121d-4ad2-bbf4-aeeeebd77542.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4a654336-121d-4ad2-bbf4-aeeeebd77542.lance new file mode 100644 index 0000000000000000000000000000000000000000..fdefc5f9bee271c2009fb20f6c7b74b0f55da5bc --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4a654336-121d-4ad2-bbf4-aeeeebd77542.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:327d434a2ad3b08690d752c62a74969136915f7e7fd497984d545c163d5e7142 +size 136911 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4a698204-8a5b-45a2-bc7b-8bef9894cd17.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4a698204-8a5b-45a2-bc7b-8bef9894cd17.lance new file mode 100644 index 0000000000000000000000000000000000000000..529068d89865e413e14f0c7bf7d7559b0509ab1d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4a698204-8a5b-45a2-bc7b-8bef9894cd17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c18d31a42eb4ce2536069823617d971a537424c819d0ae4f6d9c0a86f7ab804e +size 138413 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4be00598-2b03-4717-9568-4723de247121.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4be00598-2b03-4717-9568-4723de247121.lance new file mode 100644 index 0000000000000000000000000000000000000000..baee8f6f1e7588ea5e0e8993cdaabd6bfae10319 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4be00598-2b03-4717-9568-4723de247121.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef8b443a0f409630f1d1b525c02c53b3835e48f7497713f27fbde8d6ac1775c +size 136548 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4c2d1b2c-b730-4831-8346-ae4e78c2af61.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4c2d1b2c-b730-4831-8346-ae4e78c2af61.lance new file mode 100644 index 0000000000000000000000000000000000000000..b4cc230d2decedaab38c4f8c9794b70c87a6e486 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4c2d1b2c-b730-4831-8346-ae4e78c2af61.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024c6e6df4c43ab2bf87c3617f49191d6f92d08ddf0e6aaa0bca6472e2026769 +size 137248 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/4c9a4119-3c4a-4b99-a238-087d30a6950b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4c9a4119-3c4a-4b99-a238-087d30a6950b.lance new file mode 100644 index 0000000000000000000000000000000000000000..78df513572b1e132534e498c96aa8e4b99d7eafb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4c9a4119-3c4a-4b99-a238-087d30a6950b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c19ebe449cde49507c5bd04fca382fb12c48129e2f67c980cd14bcee3800ce57 +size 137092 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4d60156b-591f-4ff4-8586-20f676539e07.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4d60156b-591f-4ff4-8586-20f676539e07.lance new file mode 100644 index 0000000000000000000000000000000000000000..5653de0ef387065766d872e7882ad16dc819d867 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4d60156b-591f-4ff4-8586-20f676539e07.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56541fb4624b56a5276c3b812c9a6c02e08af5090f8d80d484d24143e9512cd3 +size 83415 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4d97eff2-930e-453f-80b2-d38e0b5f2d01.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4d97eff2-930e-453f-80b2-d38e0b5f2d01.lance new file mode 100644 index 0000000000000000000000000000000000000000..09b5261cf195ec1946e95ca0f42bffe7d10047fb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4d97eff2-930e-453f-80b2-d38e0b5f2d01.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:862cc70c298536096883a5754caaa54b84665f2a50dc2c81bb0cd49471b49fb3 +size 139468 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4db1a617-7ea5-42fe-9a7b-937eb085c596.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4db1a617-7ea5-42fe-9a7b-937eb085c596.lance new file mode 100644 index 0000000000000000000000000000000000000000..975096f99d7d42f936f26ba528e08db9df2a322b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4db1a617-7ea5-42fe-9a7b-937eb085c596.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11005b8f1a38294b378c59e55e43e30497c5f985858b4c37e47ae7440e180b4a +size 137094 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4dd2395a-ee95-4963-b43b-89ed409dd6e8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4dd2395a-ee95-4963-b43b-89ed409dd6e8.lance new file mode 100644 index 0000000000000000000000000000000000000000..be19ba7ffb449c3f469fe9f8c3a0c3de328e92ef --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4dd2395a-ee95-4963-b43b-89ed409dd6e8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a37b84ad6356450f366896e43bd6f3550a56f8987e58395dfbf5f59317e585c1 +size 137293 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4eabbe82-2a23-44d7-87cb-ae2673983672.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4eabbe82-2a23-44d7-87cb-ae2673983672.lance new file mode 100644 index 0000000000000000000000000000000000000000..1a724bb1c12bb53404427f5b8a754e90a280ef0b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4eabbe82-2a23-44d7-87cb-ae2673983672.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb344f0adb69f11c40ec36f92b4f6e4a053a3a2a2872161f3152e0adb9b6204f +size 139085 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4ed98e3a-ab9a-4f62-ab0a-37d4e765d60a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4ed98e3a-ab9a-4f62-ab0a-37d4e765d60a.lance new file mode 100644 index 0000000000000000000000000000000000000000..b42e72fc0e03cbfcd54727b93588fd1adabe7eb4 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/4ed98e3a-ab9a-4f62-ab0a-37d4e765d60a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042a89fd93f19f07ccbb71986fb73a758b57381601f5cd465ebf4e36ee2237e6 +size 141180 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4edd812b-e437-4075-8248-e52589c241bf.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4edd812b-e437-4075-8248-e52589c241bf.lance new file mode 100644 index 0000000000000000000000000000000000000000..206006dfcccee5d42234771cc1a549efc9bd6ded --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4edd812b-e437-4075-8248-e52589c241bf.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ef2bd471a85d52e2521ba6c514cdf59c2cdc71b30eea0334fa8c34e070f275d +size 136713 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/4ef762d9-9b72-476d-b8a3-1e2fe7cf5b9a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/4ef762d9-9b72-476d-b8a3-1e2fe7cf5b9a.lance new file mode 100644 index 0000000000000000000000000000000000000000..aa301a92a5af611db6f90ec02d65839bf70a60f1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/4ef762d9-9b72-476d-b8a3-1e2fe7cf5b9a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d242966707f501e92c19e0538bdbe556bedacb5ca6dd76cd321ae500d5a77ff +size 139166 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/51667b44-271e-4005-874f-5fd20dfdb76f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/51667b44-271e-4005-874f-5fd20dfdb76f.lance new file mode 100644 index 0000000000000000000000000000000000000000..ad721d8cfe3b006700ed0d9e428fe73831989ad2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/51667b44-271e-4005-874f-5fd20dfdb76f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec87f0298460e914f2219c7d322de937281497320969aec347c46de9e40a59fb +size 139232 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/51c1c702-514c-4b8b-aa37-821441a51727.lance b/.lancedb/nltk_chunking_BAAI.lance/data/51c1c702-514c-4b8b-aa37-821441a51727.lance new file mode 100644 index 0000000000000000000000000000000000000000..7fb6d2e8e96ec521870bc83e95f121ba1566e444 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/51c1c702-514c-4b8b-aa37-821441a51727.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8ff54e126c6cff2536c19db56469bd57003f4f713d6c5e1b209be06239a68fa +size 137140 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/52f1a862-b8f8-468d-b36b-d1546a3bfe09.lance b/.lancedb/nltk_chunking_BAAI.lance/data/52f1a862-b8f8-468d-b36b-d1546a3bfe09.lance new file mode 100644 index 0000000000000000000000000000000000000000..5c20d2dd8fe08322edb60c9c35f3cd28666945dc --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/52f1a862-b8f8-468d-b36b-d1546a3bfe09.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85da860ea1f57a52db33e62e4a39d238729aa65c5c9d50b4bb18fcafe6ae7485 +size 136244 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/532b1fbd-f78c-432f-9482-e382b7bada6f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/532b1fbd-f78c-432f-9482-e382b7bada6f.lance new file mode 100644 index 0000000000000000000000000000000000000000..c84e08525a111f551cf1687e0608e25d767af6ed --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/532b1fbd-f78c-432f-9482-e382b7bada6f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77b587730ee3a3b5e4c6767eb76a6ff00b7ffd6bb89c669acf761d78fd3cdc2c +size 136498 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/53fef0f5-1019-4b18-b25e-ad49655195af.lance b/.lancedb/nltk_chunking_BAAI.lance/data/53fef0f5-1019-4b18-b25e-ad49655195af.lance new file mode 100644 index 0000000000000000000000000000000000000000..548cb4c41f394781661f050f624b7ef3b498f778 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/53fef0f5-1019-4b18-b25e-ad49655195af.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43f4b61ca52f1f074a476a1b29200d498df3510e4e2ea89129ca872e9221cc1e +size 136291 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/54aed152-8374-49f0-a08a-0b52c3e28e37.lance b/.lancedb/nltk_chunking_BAAI.lance/data/54aed152-8374-49f0-a08a-0b52c3e28e37.lance new file mode 100644 index 0000000000000000000000000000000000000000..2cfa7a23f2f904673777ff636b90069c401ed586 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/54aed152-8374-49f0-a08a-0b52c3e28e37.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d91fba589acb09fecec247656a827af5c514416d08230e8b400000e5f31f130 +size 136383 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/583bb789-187a-4f70-b885-ce0701e016b5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/583bb789-187a-4f70-b885-ce0701e016b5.lance new file mode 100644 index 0000000000000000000000000000000000000000..107a664e02509863a0cd5c6112e0790a1b972261 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/583bb789-187a-4f70-b885-ce0701e016b5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8208abf12e7f0b551e9b5ea82fa00996dc0db43ae4ded6a84e9ef8f7af17efd7 +size 138373 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/58b4e412-7bbf-412e-854a-cf853516cafc.lance b/.lancedb/nltk_chunking_BAAI.lance/data/58b4e412-7bbf-412e-854a-cf853516cafc.lance new file mode 100644 index 0000000000000000000000000000000000000000..b028f3d17b9423053403bec27848a801832b82ff --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/58b4e412-7bbf-412e-854a-cf853516cafc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94e8f283dbff9e4c73db6dc928a68f3a094f2ee8751f7f689a7abb2787b4bdf9 +size 135853 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a05d23f-a1bc-4836-885a-7b29d87ab3f1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a05d23f-a1bc-4836-885a-7b29d87ab3f1.lance new file mode 100644 index 0000000000000000000000000000000000000000..27f139b67ae17610ebfc1357271e070e48ead8f7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5a05d23f-a1bc-4836-885a-7b29d87ab3f1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c79c5c65f55b4d924000c6912b76611fde945e7c85a7dbac894b151b491e3f0 +size 140800 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a113382-c1e2-409f-9f0a-3110ce72944e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a113382-c1e2-409f-9f0a-3110ce72944e.lance new file mode 100644 index 0000000000000000000000000000000000000000..fe69a0b5d03e510d4eee9a5d11a2521c55f613fe --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5a113382-c1e2-409f-9f0a-3110ce72944e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ae8a315c75f7712d96b2449ce49c959381de7de075136865857fca243cffd13 +size 136452 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a11b210-4859-4cad-bf3e-50f0dd9f0aa6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a11b210-4859-4cad-bf3e-50f0dd9f0aa6.lance new file mode 100644 index 0000000000000000000000000000000000000000..b005aecbaad8c051082a403540ea54bf7b4622e7 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/5a11b210-4859-4cad-bf3e-50f0dd9f0aa6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14f6dca437633c7948e1a024466107986e35b26dd9ad075225799d9ea1bb0e8c +size 138220 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a2a2923-6182-446e-8fc6-dd664db5d721.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a2a2923-6182-446e-8fc6-dd664db5d721.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd1ca8b48711e271d77bae8f6a59d1f699aa7098 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5a2a2923-6182-446e-8fc6-dd664db5d721.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e476e0a04d93a0d036832cd406b0059968eb5ba0d3122e771fc29d2fa59fd5e +size 136648 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a413fc9-bc95-46f0-8239-3daa63380a70.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a413fc9-bc95-46f0-8239-3daa63380a70.lance new file mode 100644 index 0000000000000000000000000000000000000000..99e5927f6636c9549d6ace57aa0a9f0cbcc15103 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5a413fc9-bc95-46f0-8239-3daa63380a70.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59fb57ba54aa5f97a254c3276fc80b550a60a32c7ad50299821d71ebf736c6d5 +size 139687 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5a62d850-aeb5-4293-8e41-1ad405eddc08.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5a62d850-aeb5-4293-8e41-1ad405eddc08.lance new file mode 100644 index 0000000000000000000000000000000000000000..7d75e93d62d0b2dc370e5ef87df6374666862537 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5a62d850-aeb5-4293-8e41-1ad405eddc08.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bbd2264c7ee9e5021c4419fc5689b36dc6a12a98c4c89a1b45b36e7b820310b +size 135910 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5bd8f3a5-3e60-493e-b968-767c771c784a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5bd8f3a5-3e60-493e-b968-767c771c784a.lance new file mode 100644 index 0000000000000000000000000000000000000000..7bfbe66c04bec6d8baff51454d2db9608a1dacd6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5bd8f3a5-3e60-493e-b968-767c771c784a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef5aa03d2dc46e824ad132fa2208e1c193dda917d0323c387affab6563ae9e28 +size 137644 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5c8d8570-9e5e-4a54-9f24-5289400f3699.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5c8d8570-9e5e-4a54-9f24-5289400f3699.lance new file mode 100644 index 0000000000000000000000000000000000000000..7c0962dba80716f334325b05b8fbf97462be78a0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5c8d8570-9e5e-4a54-9f24-5289400f3699.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2bbf69c5e89500d99a350910c38062185be50db30007cc9e358c51b95c09456 +size 138610 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5ce77073-76c3-45e8-9dd5-a3ed601c5e2a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5ce77073-76c3-45e8-9dd5-a3ed601c5e2a.lance new file mode 100644 index 0000000000000000000000000000000000000000..1d0d02c66fdf4e3554b6e8abda9c7642925a4ff3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5ce77073-76c3-45e8-9dd5-a3ed601c5e2a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb48ec3a5ec2972d27a0539225d0a4fcdc131baa407099e1539829553fb70838 +size 140015 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/5cf26116-b287-4e4a-8c71-a4f70bab20ee.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5cf26116-b287-4e4a-8c71-a4f70bab20ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..f8136381154318317fdb21283b5974a817744e57 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5cf26116-b287-4e4a-8c71-a4f70bab20ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b133cfcf9768eb2cf657d1d6d6527c5ca4c49ba78d021640985833496fee8f5 +size 136503 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5cf8d79d-4435-4827-b88d-5a315dcd9b53.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5cf8d79d-4435-4827-b88d-5a315dcd9b53.lance new file mode 100644 index 0000000000000000000000000000000000000000..e7383fbc64e121cfc0b5cdc392e92fc805172a0d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5cf8d79d-4435-4827-b88d-5a315dcd9b53.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a205309af8dbc1d5884f0adbb11760099e9c958210d90e2bd5009391fda9a45 +size 137587 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5d945a62-5d96-496f-8851-a190052cceed.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5d945a62-5d96-496f-8851-a190052cceed.lance new file mode 100644 index 0000000000000000000000000000000000000000..a6daeebd47b34f475e662ee5789ae4cfca1e7cb7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5d945a62-5d96-496f-8851-a190052cceed.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a33d4791297dec5ab39547b93bb841da12e28d697551df23e3a3009b37e569d +size 137931 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5d9a4840-9e9d-4144-b2bd-5249275d0864.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5d9a4840-9e9d-4144-b2bd-5249275d0864.lance new file mode 100644 index 0000000000000000000000000000000000000000..3c07a6dffb3e1b61e0495f022492a243799ce938 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5d9a4840-9e9d-4144-b2bd-5249275d0864.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ea2a8ea31c597f9b1e6f81325527d12b6da003557944dd34b10fe9337c10b14 +size 137072 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5de19e38-d557-42f4-a1b9-774689e9049f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5de19e38-d557-42f4-a1b9-774689e9049f.lance new file mode 100644 index 0000000000000000000000000000000000000000..8082e45559daf8373e245342713f9ada8ca1734a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5de19e38-d557-42f4-a1b9-774689e9049f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:682d748ed80bfb991e8cc44ea5344461aa8157616080b49bf1331f244bb1e350 +size 137888 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5e279a31-05a2-454f-a50b-2a103973b78a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5e279a31-05a2-454f-a50b-2a103973b78a.lance new file mode 100644 index 0000000000000000000000000000000000000000..ac5257d4b24f81db50ea4b22b259d13d14477a56 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/5e279a31-05a2-454f-a50b-2a103973b78a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74656107701189acb0c7d0b2d09748782de64864a1051a6781f8e69dca67414b +size 137410 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/5e5a92cb-bf7a-46f4-8ca1-767897a2aba5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/5e5a92cb-bf7a-46f4-8ca1-767897a2aba5.lance new file mode 100644 index 0000000000000000000000000000000000000000..9d514bf2a8d1ecc9d9b877e6e3e936b2e8210b16 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/5e5a92cb-bf7a-46f4-8ca1-767897a2aba5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9beae8fd1d473cca808b8f273e5bc6f61900c7f4fdf05eddc55739c53b6f3c33 +size 138761 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/606d8ff0-b0d3-4eba-b205-01e5f17cf361.lance b/.lancedb/nltk_chunking_BAAI.lance/data/606d8ff0-b0d3-4eba-b205-01e5f17cf361.lance new file mode 100644 index 0000000000000000000000000000000000000000..000dd013b0b1fe5a21007f5994215934c4886333 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/606d8ff0-b0d3-4eba-b205-01e5f17cf361.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8be3826ccf2b016bcfc1638ad5d05c908ac3b979e305eb5360cd89a6295bfec6 +size 136624 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/60a89942-87d4-4c7e-8fd2-64f16a471b16.lance b/.lancedb/nltk_chunking_BAAI.lance/data/60a89942-87d4-4c7e-8fd2-64f16a471b16.lance new file mode 100644 index 0000000000000000000000000000000000000000..4b89a83adea76bdc09ac2f8c54b0a628d6ad8a98 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/60a89942-87d4-4c7e-8fd2-64f16a471b16.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a6b4c395db77a0f221874aea5df34230c8bf87befd67e7e00d3dbbd468e0393 +size 137831 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/60d3d04d-d7dd-4357-be0c-edea05286c27.lance b/.lancedb/nltk_chunking_BAAI.lance/data/60d3d04d-d7dd-4357-be0c-edea05286c27.lance new file mode 100644 index 0000000000000000000000000000000000000000..be864681d433a45f2c656498d392147a31d47ae4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/60d3d04d-d7dd-4357-be0c-edea05286c27.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28398673c5c90b70211b1edcc510f8ab06d94e23b0cbfd0cd017a1dcbf3e09c +size 137577 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/61c592d9-f610-495b-87da-c55143e07b1f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/61c592d9-f610-495b-87da-c55143e07b1f.lance new file mode 100644 index 0000000000000000000000000000000000000000..ecdfb1ec0dba96a12f5675c1fec5a6b86b93855d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/61c592d9-f610-495b-87da-c55143e07b1f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d0fb315495fdeeab14faf30ba9fadd5ec14050630b0dcdc9f4e21caf0bd8682 +size 136650 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/62966ce0-01c7-4932-a0ac-d405e96c21e3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/62966ce0-01c7-4932-a0ac-d405e96c21e3.lance new file mode 100644 index 0000000000000000000000000000000000000000..71ad28ac4fad085389777293dfdcd8ae6ea0ab48 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/62966ce0-01c7-4932-a0ac-d405e96c21e3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de62a1aad6d74ab09ef8b1d47531a68ac12bb34dd8ffd0be5ac67511a02af33f +size 136994 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/643286a0-c106-4dd4-85e0-2326d39c48d5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/643286a0-c106-4dd4-85e0-2326d39c48d5.lance new file mode 100644 index 0000000000000000000000000000000000000000..90210f03661b071a7bb868e569eac871c102c0e3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/643286a0-c106-4dd4-85e0-2326d39c48d5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed309f434d6784de6786a73a4cbdc31dde316b2daf13b1f8f62d1801b617cac9 +size 138889 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/655d4fd6-c642-4b54-82d2-337a59da1a39.lance b/.lancedb/nltk_chunking_BAAI.lance/data/655d4fd6-c642-4b54-82d2-337a59da1a39.lance new file mode 100644 index 0000000000000000000000000000000000000000..03c4eea1bce7cb0b31d5c9027bc0c7e109550859 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/655d4fd6-c642-4b54-82d2-337a59da1a39.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb5b5ddbd3742879ca2156871f062aec2f2001df897532edbcf04cd0a3a3893c +size 136815 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/655fe707-d707-4362-bbd6-795c8c093bf2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/655fe707-d707-4362-bbd6-795c8c093bf2.lance new file mode 100644 index 0000000000000000000000000000000000000000..473743ac5634727485097a465a74e1680180b099 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/655fe707-d707-4362-bbd6-795c8c093bf2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28a7c1105302a7a41e129cfea54ae0809245a7cf2b4c838a7b15d465d72ad88 +size 137738 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/666d1716-ca65-4688-8c5c-996a195d1f09.lance b/.lancedb/nltk_chunking_BAAI.lance/data/666d1716-ca65-4688-8c5c-996a195d1f09.lance new file mode 100644 index 0000000000000000000000000000000000000000..f528e0ff9af3b1f0478786f0af453bca3727697d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/666d1716-ca65-4688-8c5c-996a195d1f09.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdb8c7acfcc5270e77d6566696ca52dc68dc3d509c022396f1fc09fae31c8f8f +size 137800 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/675cec8b-54c2-4c2f-81b8-7ca4e9147fbe.lance b/.lancedb/nltk_chunking_BAAI.lance/data/675cec8b-54c2-4c2f-81b8-7ca4e9147fbe.lance new file mode 100644 index 0000000000000000000000000000000000000000..d2babea9f0d3543d352ffabe659a2f4708a33542 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/675cec8b-54c2-4c2f-81b8-7ca4e9147fbe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c913cf3331003f4554073ca94936de7f5d71df299c2b047c5e5c893a952dcdff +size 137499 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/68a45276-dc4d-48d2-b5b5-9d060a473150.lance b/.lancedb/nltk_chunking_BAAI.lance/data/68a45276-dc4d-48d2-b5b5-9d060a473150.lance new file mode 100644 index 0000000000000000000000000000000000000000..d6a92502e5b4b9b4528a2593bc25e75d834a9339 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/68a45276-dc4d-48d2-b5b5-9d060a473150.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32a12c5fd908f8afc474b6f2d8bc52af6f0eab9b84dccee5fd72860672f1e89c +size 137392 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/68b106b7-0764-45fa-996a-e305ee0f81ca.lance b/.lancedb/nltk_chunking_BAAI.lance/data/68b106b7-0764-45fa-996a-e305ee0f81ca.lance new file mode 100644 index 0000000000000000000000000000000000000000..d1e971f546ac9d16f15bd092954f20842c9f19a0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/68b106b7-0764-45fa-996a-e305ee0f81ca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c98c3751dd6332e1aa46b8c2afbde73faf15b8df5e366db957341d932aa230e9 +size 136552 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/68ecfc08-40a6-4e2b-8dbf-11e31c8b38e8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/68ecfc08-40a6-4e2b-8dbf-11e31c8b38e8.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b8a6b1d375dc96b498fc5874af4608d1c1e741f --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/68ecfc08-40a6-4e2b-8dbf-11e31c8b38e8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8db8fc728f24d1af9ca38c26948c4c7117eaaaa34a2423245b1a875e5f4aa0cd +size 137929 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6971d6e4-a174-4a5d-b536-5bdf697e76b4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6971d6e4-a174-4a5d-b536-5bdf697e76b4.lance new file mode 100644 index 0000000000000000000000000000000000000000..d6deb30107f6049f149ceb11ce7205a7ce39f964 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6971d6e4-a174-4a5d-b536-5bdf697e76b4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57496e4e13cc6cbf0d2a4b5a0d0a9418aa6a533af29266a545b3b62df0733a51 +size 137333 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6a29f8c7-0648-4779-968c-a6a6b00f7f38.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6a29f8c7-0648-4779-968c-a6a6b00f7f38.lance new file mode 100644 index 0000000000000000000000000000000000000000..773bbff6d12d782a77c91287a188296ecbfc64e8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6a29f8c7-0648-4779-968c-a6a6b00f7f38.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:337a452d39c667d4054754ec2f60ba12f4bc01219803dbc49b919b0a209e6c0c +size 140904 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6be2a8ec-e1dc-4629-8a5a-fd5544fa66d9.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6be2a8ec-e1dc-4629-8a5a-fd5544fa66d9.lance new file mode 100644 index 0000000000000000000000000000000000000000..a6638d30ce3e76e1e4a587d8f208df478447abaf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6be2a8ec-e1dc-4629-8a5a-fd5544fa66d9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83c921321633e2e2da17067679e290f6b92048316b1d8e9ebfa2885745a2c404 +size 136762 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6d6b8243-580a-4147-99f4-c2f786f7c300.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6d6b8243-580a-4147-99f4-c2f786f7c300.lance new file mode 100644 index 0000000000000000000000000000000000000000..c5f7cd5b3adc30813fc61bf0b3f5579ad85c6262 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6d6b8243-580a-4147-99f4-c2f786f7c300.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e1963188c0cbdb7231c54d709c1a7a67046b6b859e9c3d930cbd3bf6baca14 +size 136734 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6d6e50de-626b-4917-9754-90f9890cefb8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6d6e50de-626b-4917-9754-90f9890cefb8.lance new file mode 100644 index 0000000000000000000000000000000000000000..0fced2b18987541d00dd3e6fba8b724dbaa77a4f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6d6e50de-626b-4917-9754-90f9890cefb8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff8270b4bd30aad75821e0fc4882d3e46219902bf8d23caf4da2008b4563ebe3 +size 135607 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6dbb8643-d08e-4e83-b380-7dea66347e53.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6dbb8643-d08e-4e83-b380-7dea66347e53.lance new file mode 100644 index 0000000000000000000000000000000000000000..76a5d3e9e80bb7724780546e42e21631ac5e91c1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6dbb8643-d08e-4e83-b380-7dea66347e53.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcc543e745fa125c99e66618f847f0a5340a95b79cb10320ad6ab5dca7cc5b99 +size 136666 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/6ee1de4b-477f-4546-8d0b-7a40edff00e6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6ee1de4b-477f-4546-8d0b-7a40edff00e6.lance new file mode 100644 index 0000000000000000000000000000000000000000..e683f7100a965a65f17973d1a13be609cecb52a6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6ee1de4b-477f-4546-8d0b-7a40edff00e6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:047dfdd8149883bce5d235d4de29be50e8e3b65bab23c53751f14796787238b5 +size 137969 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/6fb38b55-2493-44c7-b8d0-28f0eb87a328.lance b/.lancedb/nltk_chunking_BAAI.lance/data/6fb38b55-2493-44c7-b8d0-28f0eb87a328.lance new file mode 100644 index 0000000000000000000000000000000000000000..18f7faf816999757af89fd76cfe35176ab3d3143 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/6fb38b55-2493-44c7-b8d0-28f0eb87a328.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:249313a7d552c316d70fa90aca3419303a2519323e0e9669458106818e191bf9 +size 140368 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7019543c-dc30-4e37-ad7c-d6d3fae5bcb7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7019543c-dc30-4e37-ad7c-d6d3fae5bcb7.lance new file mode 100644 index 0000000000000000000000000000000000000000..51e73cb7cd03e2a028b5ef48244ff22ce8d9caae --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7019543c-dc30-4e37-ad7c-d6d3fae5bcb7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd92997b2c9d4e460e0720efdfa133fcbbe19c59ab33bc310f52ca1e3a6688e9 +size 136042 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/70a7efaf-48b0-4cc4-be47-a8bfe38a7b17.lance b/.lancedb/nltk_chunking_BAAI.lance/data/70a7efaf-48b0-4cc4-be47-a8bfe38a7b17.lance new file mode 100644 index 0000000000000000000000000000000000000000..0fde101aaf795fd63159026de2668fec27deb7bf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/70a7efaf-48b0-4cc4-be47-a8bfe38a7b17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e5c0325f1408f9f592de232956e0a6fc952333113c3522da6f252273c863ec +size 135978 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/711f1aec-3271-4233-a936-aa44f26aff01.lance b/.lancedb/nltk_chunking_BAAI.lance/data/711f1aec-3271-4233-a936-aa44f26aff01.lance new file mode 100644 index 0000000000000000000000000000000000000000..73de5cc712f28c051a53c400a6fc66abee1f852f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/711f1aec-3271-4233-a936-aa44f26aff01.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b26022d5acffb3286a54506cce74f72d34c2048334c43ae86fedebce677ef43e +size 138519 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/718243bf-d951-45a1-9437-99b4014dec04.lance b/.lancedb/nltk_chunking_BAAI.lance/data/718243bf-d951-45a1-9437-99b4014dec04.lance new file mode 100644 index 0000000000000000000000000000000000000000..a3f1fbdc5f5075c915cd1231eea060ea72cfe91e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/718243bf-d951-45a1-9437-99b4014dec04.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:938fc450674e30ff355473d09814fbaa5a29fad0fed357df57f78b347e5ff085 +size 135941 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/71ae5d39-4303-47b1-8b97-7628d37be39e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/71ae5d39-4303-47b1-8b97-7628d37be39e.lance new file mode 100644 index 0000000000000000000000000000000000000000..402c7c20ad925738288abc44eedf757f1a029c20 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/71ae5d39-4303-47b1-8b97-7628d37be39e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eab8db7590f94440e4bf91f78f24314c66d22e98dde3e5ad631df6abcd7fa1ce +size 136523 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/71b9c979-9335-464f-826b-30e9d81472aa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/71b9c979-9335-464f-826b-30e9d81472aa.lance new file mode 100644 index 0000000000000000000000000000000000000000..38ba32edfbea3d75ec65cadd74a93fe305088f46 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/71b9c979-9335-464f-826b-30e9d81472aa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36daf66566f0882d4b7a6444e6163dc90b597819ae57dec83a810fab34e96991 +size 138117 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/72909ff8-6bdc-486d-b8de-4d74ca63a6b5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/72909ff8-6bdc-486d-b8de-4d74ca63a6b5.lance new file mode 100644 index 0000000000000000000000000000000000000000..239fd06315d40050a9e0721fd9c08386d8fcca4f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/72909ff8-6bdc-486d-b8de-4d74ca63a6b5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86463781de6134c5592478cd9577589bc35b239001d4f061242247ae7225bd35 +size 135018 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/73689742-7d04-4d92-b425-5378bf906fa2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/73689742-7d04-4d92-b425-5378bf906fa2.lance new file mode 100644 index 0000000000000000000000000000000000000000..061b73cde358f02e6c76caa0faa9ef7079d29e45 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/73689742-7d04-4d92-b425-5378bf906fa2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66f76c321b53a1f0ecd413a158749c5c07ba91f97e526475536eb8482e87c118 +size 138268 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/74a09d97-e41c-4fd8-9b54-55fa95cb21a4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/74a09d97-e41c-4fd8-9b54-55fa95cb21a4.lance new file mode 100644 index 0000000000000000000000000000000000000000..a66e94e5496f581afd202d0921ea80276075522d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/74a09d97-e41c-4fd8-9b54-55fa95cb21a4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f92e22aece1d060f07b28e1dd62d60e08836244e1fca4011e55e43ff247c8a65 +size 138517 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/74f853d0-e9b7-4dc9-b765-39b5fc255994.lance b/.lancedb/nltk_chunking_BAAI.lance/data/74f853d0-e9b7-4dc9-b765-39b5fc255994.lance new file mode 100644 index 0000000000000000000000000000000000000000..f0a586a36cd6d0765bdd7a374822b62fa75759fc --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/74f853d0-e9b7-4dc9-b765-39b5fc255994.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdf3304468c23bf9f5e693bc13b4dfb4fa2c3be8d5a7b5b77c08fda402e69866 +size 135483 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/75056963-314c-429b-a0ef-be4da4e0135a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/75056963-314c-429b-a0ef-be4da4e0135a.lance new file mode 100644 index 0000000000000000000000000000000000000000..6b5a418c5f5983b8224330fd970c244cb4ec83db --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/75056963-314c-429b-a0ef-be4da4e0135a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f356424448912c8d3ccab09e25dff96b9f9ff01bfd66847cdbd787159ac693ed +size 139326 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/76769140-5b0e-4524-a62c-f0057a31b795.lance b/.lancedb/nltk_chunking_BAAI.lance/data/76769140-5b0e-4524-a62c-f0057a31b795.lance new file mode 100644 index 0000000000000000000000000000000000000000..a455479c31e7b9e0d673b4477f342ed37d671c39 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/76769140-5b0e-4524-a62c-f0057a31b795.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3f259e95013deab9bf681a13e193248f9fc0af6fa9b331c493450e9a176df69 +size 136235 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/77a5cf58-55e4-4001-a589-73b2f24d7c17.lance b/.lancedb/nltk_chunking_BAAI.lance/data/77a5cf58-55e4-4001-a589-73b2f24d7c17.lance new file mode 100644 index 0000000000000000000000000000000000000000..ded5c68e887d17ef6e6c6682fbc8b6b649a3d825 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/77a5cf58-55e4-4001-a589-73b2f24d7c17.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa6730c50d67d898bb802639c96eed23a6ca85d187b026da99ec0183a6053e52 +size 138146 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7803f2f0-578b-4272-9eca-5a96a378a986.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7803f2f0-578b-4272-9eca-5a96a378a986.lance new file mode 100644 index 0000000000000000000000000000000000000000..7106b29487acb559c2cfbb9478e8a8ec4eaf4d13 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7803f2f0-578b-4272-9eca-5a96a378a986.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7e0eeb5186ef0e083851bfa58b56ca46ab75fdc2820d0bb1a7ba7fa1c612b30 +size 137679 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7ac12e95-beb4-4c0e-b83a-97c03b9836f3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7ac12e95-beb4-4c0e-b83a-97c03b9836f3.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f8d3e5537dc7302b42f4ab75d40ced837661150 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7ac12e95-beb4-4c0e-b83a-97c03b9836f3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f53d4b89dc182afdace2c4cc8540bca0fe40c8e9c4d055389289e1574e0ab712 +size 136685 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7b722415-3a57-4e08-86e6-a2b321124dee.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7b722415-3a57-4e08-86e6-a2b321124dee.lance new file mode 100644 index 0000000000000000000000000000000000000000..8571b7c376858dec07695f2a219fe81ce6f39062 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7b722415-3a57-4e08-86e6-a2b321124dee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2695da8c9a12c5ab90283f1e7547297af46f6a360a1ef677e79ec458755b740d +size 137574 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7ba55a4f-c669-46e0-91cb-651f643a5631.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7ba55a4f-c669-46e0-91cb-651f643a5631.lance new file mode 100644 index 0000000000000000000000000000000000000000..d4deb9be7b8e9eb242842321d2c51433bdcb2150 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7ba55a4f-c669-46e0-91cb-651f643a5631.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e25588513d436276efc5fe13845ec8b65744ee39df535992ae72de9a8909ca8 +size 136689 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7ca070cf-df6f-47f3-9e2c-d4c57dd575e2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7ca070cf-df6f-47f3-9e2c-d4c57dd575e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..dda333209438b9ce8b06855b5f2a9298a218da39 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/7ca070cf-df6f-47f3-9e2c-d4c57dd575e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d3cbc725c854832688cc247ba4798fcad0fe93ff097317ecb532d95f7f02b1b +size 135577 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7d0a4b99-2f10-4327-bd7f-764577b15751.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7d0a4b99-2f10-4327-bd7f-764577b15751.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b5474a76882fab571f22cfd70ae4f7ee5e43005 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7d0a4b99-2f10-4327-bd7f-764577b15751.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75444ddb6c7ac8d5302310a3804493a565ac40218d9cfa54b461e3fd18789754 +size 139316 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7d488c8f-6421-4614-b679-33e5d2aed5cb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7d488c8f-6421-4614-b679-33e5d2aed5cb.lance new file mode 100644 index 0000000000000000000000000000000000000000..56f78851a7b8b8badaa29b6cd34f927d749e6945 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7d488c8f-6421-4614-b679-33e5d2aed5cb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ce044d56afa17241d2bd1856166a24f5238565858b468af396dc3016ec2adb +size 137396 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7e622ca4-023e-4bed-900a-ffaff47dbcb4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7e622ca4-023e-4bed-900a-ffaff47dbcb4.lance new file mode 100644 index 0000000000000000000000000000000000000000..e63500218adedf6f0440cebac3a8fdd1b6ef0882 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7e622ca4-023e-4bed-900a-ffaff47dbcb4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d71439ade1819feba17d7907afd834f4f814069b635467c16da53916a016e7 +size 138845 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/7f7a96ef-865c-4f2d-a28b-6179c8864158.lance b/.lancedb/nltk_chunking_BAAI.lance/data/7f7a96ef-865c-4f2d-a28b-6179c8864158.lance new file mode 100644 index 0000000000000000000000000000000000000000..0de042c3386d00294b1057d01eda85d4f56ff259 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/7f7a96ef-865c-4f2d-a28b-6179c8864158.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a528f3faff7d628018150ff94da84ba0be9c7db7a7363798193b24b420a81c3 +size 135689 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/80d195ec-b850-4ce5-85ff-16e21253e0cc.lance b/.lancedb/nltk_chunking_BAAI.lance/data/80d195ec-b850-4ce5-85ff-16e21253e0cc.lance new file mode 100644 index 0000000000000000000000000000000000000000..27170a0e0fc70261cd15162ac353eb449f06074f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/80d195ec-b850-4ce5-85ff-16e21253e0cc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a998c6ae8fd6db9c61e21b7a208257199ba1ae2680e1ca8a59c6a6a7ef0aad +size 139082 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/80e04aac-4230-4fb7-8567-39a6e4cb1576.lance b/.lancedb/nltk_chunking_BAAI.lance/data/80e04aac-4230-4fb7-8567-39a6e4cb1576.lance new file mode 100644 index 0000000000000000000000000000000000000000..cca5bc8847d7d55ba842460f0e6a0927c982e8c7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/80e04aac-4230-4fb7-8567-39a6e4cb1576.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa0232390dc0e29f2a3cc72d522ba5a41dbebec9cfe4a78790fd884f84ae54f +size 138811 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/80f670d3-d197-46da-b1ec-156d413a38d2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/80f670d3-d197-46da-b1ec-156d413a38d2.lance new file mode 100644 index 0000000000000000000000000000000000000000..9aba78d0b4f91176dcf19cad0aca700c1eb01423 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/80f670d3-d197-46da-b1ec-156d413a38d2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:230b1c8480571dd8dd7906a9dbae7df6566dccf0061bf2fb84cba8ecf926edd2 +size 139034 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/81342b96-ced9-4ad1-9e08-31bf113afbe7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/81342b96-ced9-4ad1-9e08-31bf113afbe7.lance new file mode 100644 index 0000000000000000000000000000000000000000..d800424c5ca301ab295c7ace257df8d91367f0fd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/81342b96-ced9-4ad1-9e08-31bf113afbe7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ec4783a590605727f9802712642320f7c922ed23443015071e51ae4d0d210c +size 137444 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8212d6c8-7ef3-41fe-a19e-3ea326c6cf46.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8212d6c8-7ef3-41fe-a19e-3ea326c6cf46.lance new file mode 100644 index 0000000000000000000000000000000000000000..649c665dab5b4575f78e810639b8919070a4d469 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8212d6c8-7ef3-41fe-a19e-3ea326c6cf46.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5fd2bd4dd9d3b7253a745d3d961cd46f2e41f4baf08a17142f4a17cd62691de6 +size 142813 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/825e3016-118d-4ddc-8258-b4b3d8a39efa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/825e3016-118d-4ddc-8258-b4b3d8a39efa.lance new file mode 100644 index 0000000000000000000000000000000000000000..6245c4429fc8f1afeb08b8b1e80504b44a55e341 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/825e3016-118d-4ddc-8258-b4b3d8a39efa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43ef00797c1d12d6ff728c3a3ada2a94f181ce3030356e9e0f77492c97a16cb0 +size 137377 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8298f34d-ed05-407d-8222-5864e0c90016.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8298f34d-ed05-407d-8222-5864e0c90016.lance new file mode 100644 index 0000000000000000000000000000000000000000..861aa9542bf58aeedc4d60d2e93ae2d5403654d4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8298f34d-ed05-407d-8222-5864e0c90016.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:900eb3267e49ae88f407a714743738ab6a8cebbce35bf3faf5796d8b6aeb5fca +size 143536 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/82aa2856-70c1-4289-bfba-0430ce48e780.lance b/.lancedb/nltk_chunking_BAAI.lance/data/82aa2856-70c1-4289-bfba-0430ce48e780.lance new file mode 100644 index 0000000000000000000000000000000000000000..97a426a63579885236b5fe0dc2fa32ace5737875 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/82aa2856-70c1-4289-bfba-0430ce48e780.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42050be4aa15b6469935687112599d369a6f54292f3bd2194787875a38dfeb70 +size 136541 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/82e91ced-12c9-4caa-9593-9d1862249b54.lance b/.lancedb/nltk_chunking_BAAI.lance/data/82e91ced-12c9-4caa-9593-9d1862249b54.lance new file mode 100644 index 0000000000000000000000000000000000000000..7160a0ef26f221271b24b9fb8462e5d959b0834a --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/82e91ced-12c9-4caa-9593-9d1862249b54.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f147f248dc71f714225157506abf7225200584c74789989790eb18b71678285 +size 137181 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/83827f0f-70ad-471e-adfb-5c1794befe27.lance b/.lancedb/nltk_chunking_BAAI.lance/data/83827f0f-70ad-471e-adfb-5c1794befe27.lance new file mode 100644 index 0000000000000000000000000000000000000000..f89af18c85b7307c01ceef2131feb0995b672160 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/83827f0f-70ad-471e-adfb-5c1794befe27.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e1ef275f6b6066f291555183f46107809d2663783daf0f1008ef46a5e694915 +size 136575 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/83e9794b-0bb4-445e-8e74-5af74836e77d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/83e9794b-0bb4-445e-8e74-5af74836e77d.lance new file mode 100644 index 0000000000000000000000000000000000000000..1ce741d72c4c1d919b5a619dbf4fda3f598716a5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/83e9794b-0bb4-445e-8e74-5af74836e77d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48fda2613cc1ae679d735ee7b114ff545aca881d2318ae6c2023282716500b53 +size 135990 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/84d05a7e-856a-4498-94cc-f6fcbcb55071.lance b/.lancedb/nltk_chunking_BAAI.lance/data/84d05a7e-856a-4498-94cc-f6fcbcb55071.lance new file mode 100644 index 0000000000000000000000000000000000000000..a83942d87c276e0b208af8727f238e58079fdb4f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/84d05a7e-856a-4498-94cc-f6fcbcb55071.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f718ff3becd9f12f8c661b33d167067d8b1c989c77d4c1adae50051998d6ba7f +size 137299 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/854c3503-76ae-4956-a3fa-65575679e836.lance b/.lancedb/nltk_chunking_BAAI.lance/data/854c3503-76ae-4956-a3fa-65575679e836.lance new file mode 100644 index 0000000000000000000000000000000000000000..2eac6b8fe3b53ad1b035c0198c0404054d62742e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/854c3503-76ae-4956-a3fa-65575679e836.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ab520d3b14fa9f7edeeb20ddec263905a4244df3c502fe1b447e1691d33d4c6 +size 135870 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/854e3826-e322-40b4-acee-89ab5c97898f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/854e3826-e322-40b4-acee-89ab5c97898f.lance new file mode 100644 index 0000000000000000000000000000000000000000..0aad2423541f4aa9062c22526ac55798f4957f9e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/854e3826-e322-40b4-acee-89ab5c97898f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4fe176924af672bdce2cd96c57336be87e32920a7b586df665617e1bee934eb +size 137642 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/85fbdbec-8a25-4aa6-9f57-03490cdf8133.lance b/.lancedb/nltk_chunking_BAAI.lance/data/85fbdbec-8a25-4aa6-9f57-03490cdf8133.lance new file mode 100644 index 0000000000000000000000000000000000000000..d150f6956c133e1f90fbdbc6ddb28e2db60eee6b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/85fbdbec-8a25-4aa6-9f57-03490cdf8133.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19b9330bfd23de5112f6f4614f67552eba1564ffec586e002b1b7ffe35e5b84f +size 136365 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/862cf4db-3ca7-41f2-84a6-a7f85b564005.lance b/.lancedb/nltk_chunking_BAAI.lance/data/862cf4db-3ca7-41f2-84a6-a7f85b564005.lance new file mode 100644 index 0000000000000000000000000000000000000000..3edda47b35196b8554b5a0af68144b2ff10abf70 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/862cf4db-3ca7-41f2-84a6-a7f85b564005.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8601690282b9a878303113b8ac14e3c1378c68f932faf293a92dd616db2fe62 +size 138839 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/876de37e-f545-4a8a-a129-228824ae20fa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/876de37e-f545-4a8a-a129-228824ae20fa.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3f3b387e4f0437e8e14a701bebdb8cb57d1f94b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/876de37e-f545-4a8a-a129-228824ae20fa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f81f43befbd44616e5bd5b2bebaad3663d0aa3fb17692d0cf7bc5675ccb60ba6 +size 137095 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/87e2eeef-46c3-44a3-b195-9aaf9f10e57d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/87e2eeef-46c3-44a3-b195-9aaf9f10e57d.lance new file mode 100644 index 0000000000000000000000000000000000000000..2dd4f5420b9ecc64e6914b637f3bf499ec549802 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/87e2eeef-46c3-44a3-b195-9aaf9f10e57d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ae50f1a7205796f12fa9aa9ba5920335aa6e1d81b99508e49758ad34d572a25 +size 137663 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/87fe632a-d48a-4e84-bae6-1eeec76712e7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/87fe632a-d48a-4e84-bae6-1eeec76712e7.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2256c8c5b16c9c033bf5d31c50afa3c0dc93eb7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/87fe632a-d48a-4e84-bae6-1eeec76712e7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0577b2eb373594f02a9c0f1fac93b95d8fe3b897da200f5a1d69979e0520788 +size 137307 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/880dc403-3811-4e76-9f41-559f665e1219.lance b/.lancedb/nltk_chunking_BAAI.lance/data/880dc403-3811-4e76-9f41-559f665e1219.lance new file mode 100644 index 0000000000000000000000000000000000000000..b5698d5d7635f6a9a830d0ad0ed26731f3598330 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/880dc403-3811-4e76-9f41-559f665e1219.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d6ba5400c8426f6ee5acb31b540bf77d04b8da98d9866968f1d45053f5ef46c +size 137944 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/88b80137-7baa-490e-b14d-73ce7a083955.lance b/.lancedb/nltk_chunking_BAAI.lance/data/88b80137-7baa-490e-b14d-73ce7a083955.lance new file mode 100644 index 0000000000000000000000000000000000000000..61762c8dc35bdf07aaa0462aa904338494ba7b72 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/88b80137-7baa-490e-b14d-73ce7a083955.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3be4845479a837bc0f69afd4ccca6d923df78960867d7619a13beda799300e8b +size 137364 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8a575cff-9314-4e1c-940e-894e332ce1fd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8a575cff-9314-4e1c-940e-894e332ce1fd.lance new file mode 100644 index 0000000000000000000000000000000000000000..f89fcb7e48056338b8a2c042321e3004364531be --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/8a575cff-9314-4e1c-940e-894e332ce1fd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca12dfedb4987898217e505b2fee8148659556483ebc9b76098f60548cbb24d5 +size 136938 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8b3b7f78-4c8a-4500-aa70-dc4dbd0a3b22.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8b3b7f78-4c8a-4500-aa70-dc4dbd0a3b22.lance new file mode 100644 index 0000000000000000000000000000000000000000..8d59ea0ea1f28b821a2e490192f5e76536824daf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8b3b7f78-4c8a-4500-aa70-dc4dbd0a3b22.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55498a571ab12d1719880d2b11e48d1110bd65ea04b2fb3479f3ac471e3c2461 +size 139038 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8bedf0d2-e833-422e-8e75-fd0b364f8cae.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8bedf0d2-e833-422e-8e75-fd0b364f8cae.lance new file mode 100644 index 0000000000000000000000000000000000000000..590c453d223ba2dc5a75998e4ea9c54267df2274 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8bedf0d2-e833-422e-8e75-fd0b364f8cae.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a4a581a2e98617284751c9b9150b3ff344c1257e656639c49974789fc87a214 +size 137642 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8ce86536-e0eb-407d-9af0-0d1a58005b94.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8ce86536-e0eb-407d-9af0-0d1a58005b94.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd18705d75ea2686517d1c558c30cc4e08d61b59 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8ce86536-e0eb-407d-9af0-0d1a58005b94.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67c1b8b5bac3c0481d4ce292c23a553c1fe5ce9a398c8260d9607fd2fb889470 +size 135376 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8cf13c63-6a0c-4dc5-8f79-ae367335f59a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8cf13c63-6a0c-4dc5-8f79-ae367335f59a.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b59ab49f1ed5788a371eb57a441837a59572d43 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8cf13c63-6a0c-4dc5-8f79-ae367335f59a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4997453915c3f46785f53b3842bc60f96e8142dcbfd5663353c713ecd5044f57 +size 137395 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8d899c15-b1d0-411c-bee4-5e601e6e0776.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8d899c15-b1d0-411c-bee4-5e601e6e0776.lance new file mode 100644 index 0000000000000000000000000000000000000000..0ddef8896544b1f37fee9e2e7b3fdec059d2a45e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8d899c15-b1d0-411c-bee4-5e601e6e0776.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0074cbc591c6d7cf80e17003c2013074ea33163e3f915e362eac1775f9dcecb +size 136311 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8dadf027-3f45-4bd1-95d1-4d6a990b42ca.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8dadf027-3f45-4bd1-95d1-4d6a990b42ca.lance new file mode 100644 index 0000000000000000000000000000000000000000..93916ff987899c469d719f0c79a97b169720b8e2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8dadf027-3f45-4bd1-95d1-4d6a990b42ca.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:165746898147a87bdb4f49665253abac1e1ccf22da916e91ad63bce94f861adf +size 138990 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/8e71341b-c133-48f3-bcf9-6d09cc6b4efb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8e71341b-c133-48f3-bcf9-6d09cc6b4efb.lance new file mode 100644 index 0000000000000000000000000000000000000000..4fc155edc5c24d57acb7b9e832a758e97b68698f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8e71341b-c133-48f3-bcf9-6d09cc6b4efb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6778fffcc25bebbf2ca603b641fa836c995920b173b75ff204798f48515eac0c +size 135974 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8f1f5880-2d21-498b-a3e4-7689efb6e5d0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8f1f5880-2d21-498b-a3e4-7689efb6e5d0.lance new file mode 100644 index 0000000000000000000000000000000000000000..436c4c1b44bf57814248c92536a3c32a28e765f6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8f1f5880-2d21-498b-a3e4-7689efb6e5d0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8bfddbc289fef681a7fdc0c6f083bfaa20e52a8c75386cfd3ec5d860b5c294d +size 139390 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8f4139fc-78f9-4a52-8455-12b97241bb77.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8f4139fc-78f9-4a52-8455-12b97241bb77.lance new file mode 100644 index 0000000000000000000000000000000000000000..b55a6faeff8c64faeb90cf645bdce7ec6852f290 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8f4139fc-78f9-4a52-8455-12b97241bb77.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88cbc8d983aa4a16dcfda457e98b4c5c991eb83127aadfb4cad58060f39fbd41 +size 138495 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/8ff9d7fb-61e4-4e45-a9c7-41c791dcfc16.lance b/.lancedb/nltk_chunking_BAAI.lance/data/8ff9d7fb-61e4-4e45-a9c7-41c791dcfc16.lance new file mode 100644 index 0000000000000000000000000000000000000000..a0738220736885dd7f1f9e4fca2d69aaa1e441b0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/8ff9d7fb-61e4-4e45-a9c7-41c791dcfc16.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09fd93ffa06550960e6b0abfd4465024c58f6e622ff53b435d8a2d3d7c86ef8a +size 136516 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/91292b44-80c7-4f4b-9a8a-d3947774060b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/91292b44-80c7-4f4b-9a8a-d3947774060b.lance new file mode 100644 index 0000000000000000000000000000000000000000..ffb7abe9a1e905c1d54c257bd4bc84fd225d7630 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/91292b44-80c7-4f4b-9a8a-d3947774060b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4a7dee5e7c39da93e18ea6d804f72195060b6434a16a26ee7566cc5cc24bf9f +size 138468 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9244d387-0e58-467b-af29-b85782848fea.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9244d387-0e58-467b-af29-b85782848fea.lance new file mode 100644 index 0000000000000000000000000000000000000000..bd7c420cf640be8a0252c6ace149a415efc94ac6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9244d387-0e58-467b-af29-b85782848fea.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0f05fd2a800b181ae5081dc329a3087142ed69440df2a55d025f3585f4c354e +size 137330 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/93973fdc-1c63-4d43-b037-25d9e2575da0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/93973fdc-1c63-4d43-b037-25d9e2575da0.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2b0e2fe3c48364667180acaceb17622964ab1d0 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/93973fdc-1c63-4d43-b037-25d9e2575da0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13ea05f7b0495f8a3b59860ace48776948fc56f5df9355b3efc5e3a57ebdf87d +size 137990 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/94b958f0-c1a4-4e23-ad5f-bc8311ec981d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/94b958f0-c1a4-4e23-ad5f-bc8311ec981d.lance new file mode 100644 index 0000000000000000000000000000000000000000..e89c3c1c8330ad16ef53f9bc1bf3cd9ea5e536ef --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/94b958f0-c1a4-4e23-ad5f-bc8311ec981d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bd5c4b409904c509036dfc38e392a2b083afd343ad4277d13486890cefa88a3 +size 136887 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/953779e2-1548-4862-9357-32611a4428e9.lance b/.lancedb/nltk_chunking_BAAI.lance/data/953779e2-1548-4862-9357-32611a4428e9.lance new file mode 100644 index 0000000000000000000000000000000000000000..db74a236cde54366ba4ff940f1d705b4a17ed832 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/953779e2-1548-4862-9357-32611a4428e9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79eac893a83692470e7f31bf320eb95217458eaaf0cb1736510895a2df2281ad +size 137931 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/959b5f93-61be-4dd4-ae8a-117af3392297.lance b/.lancedb/nltk_chunking_BAAI.lance/data/959b5f93-61be-4dd4-ae8a-117af3392297.lance new file mode 100644 index 0000000000000000000000000000000000000000..dae3c65025e3979357db69fde5285afe3d2a69e9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/959b5f93-61be-4dd4-ae8a-117af3392297.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a773bc67808e6277290a37e916fceab5ea51b47f5a56c9a506d77a488be8b223 +size 137653 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9612b3d1-8fdf-4ef6-bd50-f850b68d5a18.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9612b3d1-8fdf-4ef6-bd50-f850b68d5a18.lance new file mode 100644 index 0000000000000000000000000000000000000000..2c647a7775d372baf0c282298b5127c34669fc16 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9612b3d1-8fdf-4ef6-bd50-f850b68d5a18.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6b8517a6954625f8c4de3c3b84e2ca9159ee4cfcbeda8a7baed6266b0de5d1 +size 139649 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9806bb3a-db30-45b2-92e4-2d9244ada254.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9806bb3a-db30-45b2-92e4-2d9244ada254.lance new file mode 100644 index 0000000000000000000000000000000000000000..7429662f5fe2015d921fd3d816026fd764f9f9c5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9806bb3a-db30-45b2-92e4-2d9244ada254.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3880003da420820165430fdc34f27bc4b358866f199ff61fe51f279d4072880 +size 136358 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9845a23e-92b8-4ec5-8a0d-f7b6e90718ce.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9845a23e-92b8-4ec5-8a0d-f7b6e90718ce.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd952112438fe73e1d0836a1aab3f30fbcb70f99 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9845a23e-92b8-4ec5-8a0d-f7b6e90718ce.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c1e52dbdfbbc0c056a753ca563d66ec0be60a514db48e680c0f810e5175cc3f +size 138952 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/98732e49-337e-47db-9a5b-413c6ded7c7a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/98732e49-337e-47db-9a5b-413c6ded7c7a.lance new file mode 100644 index 0000000000000000000000000000000000000000..6d0fc7880a117f3185056c61c49ea1cd08e88f9c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/98732e49-337e-47db-9a5b-413c6ded7c7a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c87e594c6d39a890a64e139e899cb0ac11cfa49cc4a8eb800c8deaad818ef1a +size 137990 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/98bf54e0-2eec-467c-84a6-a201d04c5d6d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/98bf54e0-2eec-467c-84a6-a201d04c5d6d.lance new file mode 100644 index 0000000000000000000000000000000000000000..57a9a27485ce2cce076a05795c8c698d7422086a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/98bf54e0-2eec-467c-84a6-a201d04c5d6d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f614ca8eda50987fe4189d4124b84756f9f8246eec058cc419589b34c198bac5 +size 140659 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/99033610-3a76-4acf-b806-278060dd3e7e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/99033610-3a76-4acf-b806-278060dd3e7e.lance new file mode 100644 index 0000000000000000000000000000000000000000..59d61129efcff3ce68ec375da5aae189b97ee888 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/99033610-3a76-4acf-b806-278060dd3e7e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22ab1db62c4305246982be3ed389f9f6652fabfea6f71dda532f95fbacc4d88 +size 136045 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9940cef4-4191-49aa-aba1-93f0cce15bdd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9940cef4-4191-49aa-aba1-93f0cce15bdd.lance new file mode 100644 index 0000000000000000000000000000000000000000..160022f4df77535eb4187d654680f7ce4d0006a8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9940cef4-4191-49aa-aba1-93f0cce15bdd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe1329970d49818a8d7b80967be3d93927edba6f3b7da666a00ee898c23303d +size 138657 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9b6fa3ca-9fd9-443b-8b02-7d172cd2c981.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9b6fa3ca-9fd9-443b-8b02-7d172cd2c981.lance new file mode 100644 index 0000000000000000000000000000000000000000..42f0d9bf0bd8361b4deb31135cbc204930231bf9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9b6fa3ca-9fd9-443b-8b02-7d172cd2c981.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d82d134fbd8d2e596fb8f27060e376377dad90a5b925c13ad8cd4eda0257d96 +size 135562 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9be3af86-9d63-4ec5-b126-9132dd847235.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9be3af86-9d63-4ec5-b126-9132dd847235.lance new file mode 100644 index 0000000000000000000000000000000000000000..7c1650da5803e6de51c86c6db52f933626eaa3d9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9be3af86-9d63-4ec5-b126-9132dd847235.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc66a96a5c680a452c057e777eabe5e6417626ac78c3ccec033be934d2ed6084 +size 140472 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9c123ecb-a0b9-4efd-9818-db66d23b2682.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9c123ecb-a0b9-4efd-9818-db66d23b2682.lance new file mode 100644 index 0000000000000000000000000000000000000000..b5dbb2a1b8e7f2923f65ae8e106d75319f37b928 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/9c123ecb-a0b9-4efd-9818-db66d23b2682.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54ad8a906d8e33612481aa97f45e073e678325519a1bc3ab62b4e6b9abe54d45 +size 135439 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9c3a235b-208c-41f7-9cd1-8063ae6b35e3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9c3a235b-208c-41f7-9cd1-8063ae6b35e3.lance new file mode 100644 index 0000000000000000000000000000000000000000..87a83ccf3d20fd71f8312c6844f794ee57687dea --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9c3a235b-208c-41f7-9cd1-8063ae6b35e3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee9bbc287ef555b78f7b335427bea4bec2e96a207d5690dde07a6e8a7bd07cd0 +size 136562 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9c50a517-54cb-4220-9e65-5fe2940b503e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9c50a517-54cb-4220-9e65-5fe2940b503e.lance new file mode 100644 index 0000000000000000000000000000000000000000..7f243b44ff2e2c423ca2f9e294f9f52ab0ea68d1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9c50a517-54cb-4220-9e65-5fe2940b503e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:54d7f8d75bfe0100af04623d3a45090e2bce1bd4d3200da4b2daca0f7581a3c3 +size 141519 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9c9b93ec-bf94-41a7-bc2c-334ee91638f0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9c9b93ec-bf94-41a7-bc2c-334ee91638f0.lance new file mode 100644 index 0000000000000000000000000000000000000000..5004dd92369e30b55e14744c00d092d6b4258de4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9c9b93ec-bf94-41a7-bc2c-334ee91638f0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5db73f582508a8c3c4db78287c2a65ac731708973b5af1c36ea5ce844febe4 +size 136390 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9dcda0fc-1801-4b2d-8d53-188de24eaeec.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9dcda0fc-1801-4b2d-8d53-188de24eaeec.lance new file mode 100644 index 0000000000000000000000000000000000000000..05accec26c693dc5adf16293ac25967f791ead46 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9dcda0fc-1801-4b2d-8d53-188de24eaeec.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:577f1ebb2b37485e88e92f6701e25d4879ca90fab85c4734e777d447b515c51f +size 138815 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9e2a5039-82fb-49d8-ada5-acff9503fc7f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9e2a5039-82fb-49d8-ada5-acff9503fc7f.lance new file mode 100644 index 0000000000000000000000000000000000000000..9f214de0a2b203cb92ad538f882527dc4830f40a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9e2a5039-82fb-49d8-ada5-acff9503fc7f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4738fd0ba0bd2f2e6878ead41708664c5237d22b7d59810373137154e07967f3 +size 137579 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/9e7c9598-761e-4fcc-8a43-076d1cfd7903.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9e7c9598-761e-4fcc-8a43-076d1cfd7903.lance new file mode 100644 index 0000000000000000000000000000000000000000..b581157571ab1a0f382ba392a81b4cf40b1eeaf2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9e7c9598-761e-4fcc-8a43-076d1cfd7903.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2abfd511d6556d7b3f56148ae40c20ee5264db2c019d8703ab3175f44f0f386 +size 137945 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/9f5fc55d-8d42-4c2f-b5e7-b37b9e61529c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/9f5fc55d-8d42-4c2f-b5e7-b37b9e61529c.lance new file mode 100644 index 0000000000000000000000000000000000000000..f02b7262514091edeb953dad4a2e90ecd6a84914 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/9f5fc55d-8d42-4c2f-b5e7-b37b9e61529c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00a71a4f50f8d9a7ef9c281f5201495562f5485de4b69d5f2f4f0191344eb7af +size 140362 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a008797b-d582-4cac-b874-3ca6ff19375b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a008797b-d582-4cac-b874-3ca6ff19375b.lance new file mode 100644 index 0000000000000000000000000000000000000000..46c73cc84f7c251811b9f07f51c9c2c6a366d003 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a008797b-d582-4cac-b874-3ca6ff19375b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f42552cdc4ceb0a65fb0bdb3fed13f7563064ebd8167e6fd1eb0e56eb591757 +size 142413 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a0bf35cf-8072-42f7-9a4f-a4ca51854653.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a0bf35cf-8072-42f7-9a4f-a4ca51854653.lance new file mode 100644 index 0000000000000000000000000000000000000000..49c7c57d9b231ae0c183f8ca73e502abe7842cb7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a0bf35cf-8072-42f7-9a4f-a4ca51854653.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ac31e3a5e2a86d3c8a04cec095ea87fae2cacc0b4154d2b9ef9e33047f0ce5 +size 137604 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a0d28bc3-379a-4a69-ba0e-8620069a9608.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a0d28bc3-379a-4a69-ba0e-8620069a9608.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3ad1fba6ca906022e13e565e727fd6d1e3d0bce --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a0d28bc3-379a-4a69-ba0e-8620069a9608.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b746d17b7d828524cffbb6c7fe09226d9c5191464643643a11f8592dc5adb629 +size 137762 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a24f4453-7379-41aa-a5dc-0b8bdc887d67.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a24f4453-7379-41aa-a5dc-0b8bdc887d67.lance new file mode 100644 index 0000000000000000000000000000000000000000..d9df7870db7744337ee9428408a6f33cd263a404 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a24f4453-7379-41aa-a5dc-0b8bdc887d67.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca031254d1768a741f764402e759367e8b39221fc519da5b6510e18a23a6f665 +size 137761 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a25a554c-3db5-4f5f-b00f-306c5e9e7e71.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a25a554c-3db5-4f5f-b00f-306c5e9e7e71.lance new file mode 100644 index 0000000000000000000000000000000000000000..1e7b3565a918391b2525fda519b9f1b4970f7f32 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a25a554c-3db5-4f5f-b00f-306c5e9e7e71.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e98d4d964a65c40708090ef09288d422f208952c00561aaf447c10c37d62307b +size 140712 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a2e28874-77ff-4d9b-b947-4f99f7dd7d85.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a2e28874-77ff-4d9b-b947-4f99f7dd7d85.lance new file mode 100644 index 0000000000000000000000000000000000000000..7aee72e97e74d6af9b1f985fe0cf837390988089 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/a2e28874-77ff-4d9b-b947-4f99f7dd7d85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b658d268cfe751da7bea1ca8620885f3aa337eb36bbd29d6d8cf13ea85a0d38 +size 137296 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a3fec5b2-0c35-4c0f-acd1-7ab6125c1a44.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a3fec5b2-0c35-4c0f-acd1-7ab6125c1a44.lance new file mode 100644 index 0000000000000000000000000000000000000000..2a1a7c5639f12ce6db090940fcc4d08ec7bc32c8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a3fec5b2-0c35-4c0f-acd1-7ab6125c1a44.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0851fdb77f431642f2719134852c0fd15acd08b1317ac190519f2826236d2564 +size 135733 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a4710a26-73f4-4cdb-99dd-29712c31d74e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a4710a26-73f4-4cdb-99dd-29712c31d74e.lance new file mode 100644 index 0000000000000000000000000000000000000000..ff6bd001e575d7ac072609b85ba8f65498d026e3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a4710a26-73f4-4cdb-99dd-29712c31d74e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2dc5e38837a9034866d291db0b59bf01bf968debae779174a9fee2a4e4c1578 +size 138688 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a500ea57-7839-4a5e-900f-ddd5b553bb81.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a500ea57-7839-4a5e-900f-ddd5b553bb81.lance new file mode 100644 index 0000000000000000000000000000000000000000..9efad9bc02f407af78ee021cdcc682f475bf1237 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a500ea57-7839-4a5e-900f-ddd5b553bb81.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:620d76bafaf7effbda39635769a7d1b587a855f9a9efbb720220957d92d75468 +size 140580 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a6094e82-bea2-48e2-a632-7c43ea42de9d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a6094e82-bea2-48e2-a632-7c43ea42de9d.lance new file mode 100644 index 0000000000000000000000000000000000000000..2aaecf2d0636582d4edb750b788dd6d7f172cc10 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a6094e82-bea2-48e2-a632-7c43ea42de9d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966d7f9fd4084042fde703d5b3d10e73bd7df2faf6dfde614d4dfbaf2f0f6c9b +size 136963 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a63e691d-8fa1-4318-92bd-65bc7c0eb590.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a63e691d-8fa1-4318-92bd-65bc7c0eb590.lance new file mode 100644 index 0000000000000000000000000000000000000000..a6802c80ba033f69bfac48f12c487e324938eb37 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a63e691d-8fa1-4318-92bd-65bc7c0eb590.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:132d35d3383172c62f4a145547b96a318bddf5b47bbb7b73d67429569756159f +size 137392 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a72fc5ce-8797-415f-868f-9fa9bcc70878.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a72fc5ce-8797-415f-868f-9fa9bcc70878.lance new file mode 100644 index 0000000000000000000000000000000000000000..c3ef86b3f05cfc7c8f256905ebae76b570aa25fd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a72fc5ce-8797-415f-868f-9fa9bcc70878.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f8cc0ba9d17afb0554892873b3cc0e10cfdfb773f107bd86d91a63c6fba5ef +size 136774 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/a761a8cd-d489-4032-a345-4d278bd0a523.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a761a8cd-d489-4032-a345-4d278bd0a523.lance new file mode 100644 index 0000000000000000000000000000000000000000..e2dd2f8be33a66744afb023edd0e3f463479c4a9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a761a8cd-d489-4032-a345-4d278bd0a523.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f3caf118bdcb23e3a1cee36d24ef4b60f616234570720e830c9b3ca7974d9bc +size 137883 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a7a04bee-948e-48af-83a0-3cc1265ce8b7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a7a04bee-948e-48af-83a0-3cc1265ce8b7.lance new file mode 100644 index 0000000000000000000000000000000000000000..16fa3a60b09237d122d66c7f92796160154d9124 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a7a04bee-948e-48af-83a0-3cc1265ce8b7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b562adc387725d26f5f8b6f1b13d5802e8e0e4ec81209ec45b690f3f1a105255 +size 136266 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a7b49384-173a-4357-b60d-1406aff667ff.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a7b49384-173a-4357-b60d-1406aff667ff.lance new file mode 100644 index 0000000000000000000000000000000000000000..c63a58e1a2b1e4eb8cb2e92531a886e366af0b68 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a7b49384-173a-4357-b60d-1406aff667ff.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e148c8bffac44afccf13093af5bf3b1d4f1c187030efc3d1330a992bceb666 +size 137752 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a7f81199-d80f-4e4e-8b18-b6080590e7bc.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a7f81199-d80f-4e4e-8b18-b6080590e7bc.lance new file mode 100644 index 0000000000000000000000000000000000000000..cbd9675a831e1b840d7e41f7c691ca90e2619a4a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a7f81199-d80f-4e4e-8b18-b6080590e7bc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89f07443cc42ee2bef0d6c2570037770f36866589d34a8ac118e6324d74ede56 +size 137734 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a7ffb72c-c2bd-43c6-97e9-e5f566a4a7ee.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a7ffb72c-c2bd-43c6-97e9-e5f566a4a7ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..251013be9d06734a2f598a78fea8c5e6b5ba447d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a7ffb72c-c2bd-43c6-97e9-e5f566a4a7ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fec8f8b723552d070f8f7346493cebe5fd1370e4714cdc65abada69d19d198d8 +size 135486 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a80ebe28-cdb0-445e-abfd-910d0ed23580.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a80ebe28-cdb0-445e-abfd-910d0ed23580.lance new file mode 100644 index 0000000000000000000000000000000000000000..ee1487fbaebe821d8dcf1c2f3d7fcf5428528604 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a80ebe28-cdb0-445e-abfd-910d0ed23580.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47b0edb41ad3c1f3bee291303a26dabcee89ef5c9bb251d03d44c9255f87c810 +size 138036 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a840d7ed-03e3-4885-8e8e-cbfc4f5a3cab.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a840d7ed-03e3-4885-8e8e-cbfc4f5a3cab.lance new file mode 100644 index 0000000000000000000000000000000000000000..44c58da4b33e0079dcd76e958834460b58b31b9c --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/a840d7ed-03e3-4885-8e8e-cbfc4f5a3cab.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a130c93bbc5df1b34cab07a2dc87b88611df6e339514ebcd417cc712b691710c +size 138676 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a98595f9-b431-49e9-844c-31b5dc6737d8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a98595f9-b431-49e9-844c-31b5dc6737d8.lance new file mode 100644 index 0000000000000000000000000000000000000000..bb31fdf4b6612beb617e7a3ea99fb428f5491b51 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a98595f9-b431-49e9-844c-31b5dc6737d8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66e2a14d2e205de76fcea89330d95b237052b7ebaf382dc0d9c96aea38672ada +size 139848 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/a99f9ca1-89c6-49bc-8332-e43b76fa6320.lance b/.lancedb/nltk_chunking_BAAI.lance/data/a99f9ca1-89c6-49bc-8332-e43b76fa6320.lance new file mode 100644 index 0000000000000000000000000000000000000000..5d258bbb8d84c47814efa68216616542e9c30cdf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/a99f9ca1-89c6-49bc-8332-e43b76fa6320.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4aa4333ed45d906979c316747a27ff358be17399bb960fa941b59015d6e95e +size 135310 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/aa2071b9-d65d-413b-91d1-250c73565835.lance b/.lancedb/nltk_chunking_BAAI.lance/data/aa2071b9-d65d-413b-91d1-250c73565835.lance new file mode 100644 index 0000000000000000000000000000000000000000..92a6fb17b896016881759442b8c508344c883ae9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/aa2071b9-d65d-413b-91d1-250c73565835.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63f141e21a3b8b2e74f5290b13fc398e721bc9607b4bc1b7a853e3b0da7e95bf +size 137463 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/aa2f3e2b-d6cf-4d85-aaa8-5ac1f1c62c40.lance b/.lancedb/nltk_chunking_BAAI.lance/data/aa2f3e2b-d6cf-4d85-aaa8-5ac1f1c62c40.lance new file mode 100644 index 0000000000000000000000000000000000000000..38ec42e1c8b56557633a647153ef974230737932 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/aa2f3e2b-d6cf-4d85-aaa8-5ac1f1c62c40.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39875321af77ba9ebc655d20b4a29877aa54eed3a71bb8bfb81f03bb511ac91d +size 143434 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ab05a583-6fc1-4431-bf9d-0b58eb1e5704.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ab05a583-6fc1-4431-bf9d-0b58eb1e5704.lance new file mode 100644 index 0000000000000000000000000000000000000000..1f4fe92d79cf416035427f2dbeb9ad92afcfa484 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ab05a583-6fc1-4431-bf9d-0b58eb1e5704.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3654ef3912d06a3dd2440982448253fae5a4cd15d4c5276a543afe7a87d6fa0 +size 136321 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/adf75c21-1b9e-484b-872f-93c28b87724c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/adf75c21-1b9e-484b-872f-93c28b87724c.lance new file mode 100644 index 0000000000000000000000000000000000000000..07c722ac3a6d167e8883163df58f2cd1797feabb --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/adf75c21-1b9e-484b-872f-93c28b87724c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0414b379efa5cecebc83034a8a8eae8e1fe99e1ed9c43309a015637578142c38 +size 136276 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/ae8fd0e1-a164-4daa-a33a-598b558d1258.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ae8fd0e1-a164-4daa-a33a-598b558d1258.lance new file mode 100644 index 0000000000000000000000000000000000000000..70cdab7c284a8de910848cb94b19289729c2e104 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ae8fd0e1-a164-4daa-a33a-598b558d1258.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75d2a37bfa5dbb689f378778041f84f9790735892e170c9f1bb67cd63f5204c4 +size 142379 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/af5c941b-0ee9-421d-badd-cb75d1d2f26c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/af5c941b-0ee9-421d-badd-cb75d1d2f26c.lance new file mode 100644 index 0000000000000000000000000000000000000000..16eebb0ba5f3d24658d5c58d5ab15bc6cc019181 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/af5c941b-0ee9-421d-badd-cb75d1d2f26c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2a408d64b6424122b5c8b872e4375c343fa2482726a43c7bc7c1198e6a9e159 +size 140830 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/af5f2646-4562-4b02-9d7c-50482bc32221.lance b/.lancedb/nltk_chunking_BAAI.lance/data/af5f2646-4562-4b02-9d7c-50482bc32221.lance new file mode 100644 index 0000000000000000000000000000000000000000..b7992ab3d7c334429233daef0c82a146b53bd597 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/af5f2646-4562-4b02-9d7c-50482bc32221.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5cbbee618c28d67fa14aae135468b0095ee397d90f0a7aa476ba3afb2a13b18 +size 137688 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b02bf797-776b-4fb0-845d-648e40aad113.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b02bf797-776b-4fb0-845d-648e40aad113.lance new file mode 100644 index 0000000000000000000000000000000000000000..ccba04c18a56dc7552b466ac5b626eb65e86e641 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b02bf797-776b-4fb0-845d-648e40aad113.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d66510f41057064a9d6f57722a65be804be05205b4a58e0ab4c20ae8060ae50 +size 136496 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b09989fb-36b9-4c9a-9973-7c5101df31d0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b09989fb-36b9-4c9a-9973-7c5101df31d0.lance new file mode 100644 index 0000000000000000000000000000000000000000..4bd37ffa275884ce5fabba40c2b620bb3c303a40 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b09989fb-36b9-4c9a-9973-7c5101df31d0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2cac0f4cfa794c8ea92d8fb626183e77d8e3d9c0a246dcfcff7eb600537bc0 +size 137017 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b133962d-9062-4e0e-97e8-ff88674b590f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b133962d-9062-4e0e-97e8-ff88674b590f.lance new file mode 100644 index 0000000000000000000000000000000000000000..3da1a7f3bbf8a7eff1e9d25eaadb31a889c636e8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b133962d-9062-4e0e-97e8-ff88674b590f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1463826f2cdb99b026c7ab8291faede69f9f3778cf6e58232f5bfc2d1166626f +size 135869 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b160713b-f578-4439-b788-a3db08f616a0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b160713b-f578-4439-b788-a3db08f616a0.lance new file mode 100644 index 0000000000000000000000000000000000000000..c2be3dd12b54b1c9aa1270ab53d745abc189d469 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/b160713b-f578-4439-b788-a3db08f616a0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:139dda1c33439f199ce799bbe2996cd036a1c8048d8cdbc419de3f6b0ab6a5f7 +size 136747 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b1cf6733-166e-48d4-8b60-66f95e97a8c6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b1cf6733-166e-48d4-8b60-66f95e97a8c6.lance new file mode 100644 index 0000000000000000000000000000000000000000..6055dbeee6e3a39ad8bd3ad2258f75410b3790c3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b1cf6733-166e-48d4-8b60-66f95e97a8c6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d40d3302389dd6237064ee983fb3c280216281e13ae8cdc381d87801ff8df70 +size 138948 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b210e748-5a3f-480d-8095-dc944e7d6ac7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b210e748-5a3f-480d-8095-dc944e7d6ac7.lance new file mode 100644 index 0000000000000000000000000000000000000000..8e7cf9e8f8259b5c0b5857e9fe66fc204d1fe674 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b210e748-5a3f-480d-8095-dc944e7d6ac7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9912f660e2016e120060165e4f3e841699d8c61a72ae749fb49958c903ea2b8 +size 140534 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b280cb9e-0d44-48c7-af9f-7df959e24f85.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b280cb9e-0d44-48c7-af9f-7df959e24f85.lance new file mode 100644 index 0000000000000000000000000000000000000000..edddaffa210e9a66cdebf48e55e61a18c6560005 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b280cb9e-0d44-48c7-af9f-7df959e24f85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad27a9c95249182989a51b1951e7686af6e53594450ed1bc0f080b537f42f369 +size 135709 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b2eefe3a-6e42-4a37-ae57-75e77d103b3c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b2eefe3a-6e42-4a37-ae57-75e77d103b3c.lance new file mode 100644 index 0000000000000000000000000000000000000000..88b1d057242505e2e9a882c558b231d0bdd6f38b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b2eefe3a-6e42-4a37-ae57-75e77d103b3c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:941a4e9f69e72cc98d6877e74453c6312bcb6f301e0ae2dea61c715cbd1f957b +size 139334 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b3077620-c4d1-4850-8855-c018446f28e2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b3077620-c4d1-4850-8855-c018446f28e2.lance new file mode 100644 index 0000000000000000000000000000000000000000..714feae9efc9e5e18cba7d6231b81017c901f938 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b3077620-c4d1-4850-8855-c018446f28e2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8df2dc2bc5f21f06fa49b4f9d696bc137692dcb8dc63752aee0d91f005d3f1b7 +size 135645 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b43eb2f3-7ace-423c-908d-6322171976b4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b43eb2f3-7ace-423c-908d-6322171976b4.lance new file mode 100644 index 0000000000000000000000000000000000000000..9063ea6e8d9368773fe48d85933a09d5029092fc --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b43eb2f3-7ace-423c-908d-6322171976b4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b67974b970b6fe58b75c398f413dfd81d43e48ee3ba251aabb9578e43a1815c2 +size 138494 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/b4a6a1d6-8a82-4bb8-9b96-674cf3234ec4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b4a6a1d6-8a82-4bb8-9b96-674cf3234ec4.lance new file mode 100644 index 0000000000000000000000000000000000000000..0f984a4328f7c465a0c59a3c79d87951dbcfc11f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b4a6a1d6-8a82-4bb8-9b96-674cf3234ec4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98560d6c16e29ddc25ee1dd58b778384379c65cfe2e4ca498ecaa04eaad6bb5c +size 140577 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b4c2db38-f158-48f5-83d8-eb1e52b6a268.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b4c2db38-f158-48f5-83d8-eb1e52b6a268.lance new file mode 100644 index 0000000000000000000000000000000000000000..8ce55a1f1b7d1f254e070a756a2f46ac74352a33 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b4c2db38-f158-48f5-83d8-eb1e52b6a268.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6b6b8eb27ed2f5f7f8ee519feb701ac934bd7bae18581740de2a6f6bd5bcfc +size 139281 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b4d70967-5f85-485d-a0e7-7c24554aa5c1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b4d70967-5f85-485d-a0e7-7c24554aa5c1.lance new file mode 100644 index 0000000000000000000000000000000000000000..adbfa427ae3ae585e3735546a2ba409bf2c9dc08 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b4d70967-5f85-485d-a0e7-7c24554aa5c1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dd52e1d662330e2202ccbe94de4907bb9370563f946b0f89a21e947622cef64 +size 136687 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b51604e4-081a-42ca-a218-ea2826f28b44.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b51604e4-081a-42ca-a218-ea2826f28b44.lance new file mode 100644 index 0000000000000000000000000000000000000000..33b991581139d5238fdbf35863454a080000bec9 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b51604e4-081a-42ca-a218-ea2826f28b44.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d12c3a6363d478a643deafa93d6a68335883af2b9e3b3db2cad98298d772368 +size 139806 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b51bfbf1-780d-4eb8-b410-a8d67e74d1f2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b51bfbf1-780d-4eb8-b410-a8d67e74d1f2.lance new file mode 100644 index 0000000000000000000000000000000000000000..106acf42c3d157d8d3d9435251fea92f519fe831 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b51bfbf1-780d-4eb8-b410-a8d67e74d1f2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:135d0efea25c506cf13172f1a73fc25cabef1a788f94c28f0dc04582c5b7f3e9 +size 135776 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b5681048-408a-4f33-ac0d-37f357c33793.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b5681048-408a-4f33-ac0d-37f357c33793.lance new file mode 100644 index 0000000000000000000000000000000000000000..7fcb81ba80522da51250d130b5b9fddd8a977bf2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b5681048-408a-4f33-ac0d-37f357c33793.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b4799b5ab547aa0884f57909025a5b7ad8cf4f5c17280c6d65ee59094924787 +size 137337 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b59197cb-9cdd-440e-bd8a-360339a698e3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b59197cb-9cdd-440e-bd8a-360339a698e3.lance new file mode 100644 index 0000000000000000000000000000000000000000..4b91c295555001f3cbf2af3ae2f2b751013a700e --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/b59197cb-9cdd-440e-bd8a-360339a698e3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2c9ca26890626585090376f5cb5d8e2ca9e45d9fcdde89aa7f6d78ca6362fc1 +size 137521 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b5d6d0c8-1f5f-4ce0-8613-68e14db1ac0b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b5d6d0c8-1f5f-4ce0-8613-68e14db1ac0b.lance new file mode 100644 index 0000000000000000000000000000000000000000..474315b4075c04d57456c4d95a6334926f1cd8d3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b5d6d0c8-1f5f-4ce0-8613-68e14db1ac0b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48ca1a5ffdf6caa2109d7e04e68b6c535fdcc3b00709506b5fe2462ae27851f3 +size 135602 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b6415934-66e0-4273-9a09-696ff73007ac.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b6415934-66e0-4273-9a09-696ff73007ac.lance new file mode 100644 index 0000000000000000000000000000000000000000..8f4474c43e52c8d84db4f0c182d68350d4262c8d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b6415934-66e0-4273-9a09-696ff73007ac.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa9b37f8c5062bb2485bbfadd85cf788a66e53c871d1d57dc573a7dc9af9eafa +size 136350 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b64b765b-4c7a-4f6a-8a15-391e654dec8e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b64b765b-4c7a-4f6a-8a15-391e654dec8e.lance new file mode 100644 index 0000000000000000000000000000000000000000..c662a621cad089f739f6a01158e58d81e2562d39 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b64b765b-4c7a-4f6a-8a15-391e654dec8e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b715ced9706342001940715b32f00f79268aaa6091a7a034f7166c893da5a48 +size 135909 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b6dc96b8-d525-43e3-b4fa-bd1964633988.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b6dc96b8-d525-43e3-b4fa-bd1964633988.lance new file mode 100644 index 0000000000000000000000000000000000000000..6d208ed8deac30777d74be88603064ce725ff538 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b6dc96b8-d525-43e3-b4fa-bd1964633988.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9d9b4a0bc9c3c8e7df6da72875a205383b8df467b49ff45ec266e3a55742c8 +size 135809 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b7761354-06dd-4997-9547-1cf6975df4b2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b7761354-06dd-4997-9547-1cf6975df4b2.lance new file mode 100644 index 0000000000000000000000000000000000000000..6b58872b8f3423ff87d50ecccbd956ea05bf618a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b7761354-06dd-4997-9547-1cf6975df4b2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0a293408bf87dcd158df75303ad2ee34d9cb8abe8f1b7d60de749211dee97db +size 138117 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b87bb9d6-04eb-4721-9e49-a0bbdc9ae745.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b87bb9d6-04eb-4721-9e49-a0bbdc9ae745.lance new file mode 100644 index 0000000000000000000000000000000000000000..bc8e0f063ead0354d6883ddcc42664028112b232 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b87bb9d6-04eb-4721-9e49-a0bbdc9ae745.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:110ee08741aa893e9fb9fb6ad9b26f7d085cbea24783ab7cd41b96a34818b0d7 +size 138039 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/b90b6b26-5df9-47a4-87a8-984122659f88.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b90b6b26-5df9-47a4-87a8-984122659f88.lance new file mode 100644 index 0000000000000000000000000000000000000000..960c93c0d67d1b37ef0e9ec999f9a4ab9a016c9b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b90b6b26-5df9-47a4-87a8-984122659f88.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:379a1fc1952ab5911e2b7732135132ffb60a388b83ded79c37ec960b57c14b96 +size 139142 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/b92dddfc-841d-41c3-8a26-6782e58a477f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/b92dddfc-841d-41c3-8a26-6782e58a477f.lance new file mode 100644 index 0000000000000000000000000000000000000000..b167bd0e508affd97a8582dcc1a5ff75bd6d1b07 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/b92dddfc-841d-41c3-8a26-6782e58a477f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8573c175e28400ec608ae63ac7aa46e6da0ff1820a8713c1bd3fcefa759fd226 +size 134719 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ba900b2e-a28f-47ec-b132-adcfb66f33ee.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ba900b2e-a28f-47ec-b132-adcfb66f33ee.lance new file mode 100644 index 0000000000000000000000000000000000000000..da64ecc7adfda667ae7cc10dbdf0e1d4be95620d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ba900b2e-a28f-47ec-b132-adcfb66f33ee.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5126d1a2de2879b9efb117e37ed54e5078af2b19e6ea7b370aab24994ac29968 +size 139063 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bba568da-b2e7-4f94-a97c-a9523539a23b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bba568da-b2e7-4f94-a97c-a9523539a23b.lance new file mode 100644 index 0000000000000000000000000000000000000000..158a20d9578bbbc35a049f78032eb29247b0b53b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/bba568da-b2e7-4f94-a97c-a9523539a23b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45912b881af6a179d284617e1fc09199a7134104229a761b351995b3c56f1460 +size 137017 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bc32e2c3-136b-4bdb-83f9-15bd70996f78.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bc32e2c3-136b-4bdb-83f9-15bd70996f78.lance new file mode 100644 index 0000000000000000000000000000000000000000..65911dad88a84a5e79363f3a6394542e2e8b26b4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/bc32e2c3-136b-4bdb-83f9-15bd70996f78.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3885bc8bf33f66a74ddb60566f8e740554b6580b3545f26c03b3b3c7ddbb2e +size 141353 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bd12d3df-c5cf-4342-b6f8-94f39a4d1f91.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bd12d3df-c5cf-4342-b6f8-94f39a4d1f91.lance new file mode 100644 index 0000000000000000000000000000000000000000..137e234bb444045a677be96ce234aa5d5e89a094 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/bd12d3df-c5cf-4342-b6f8-94f39a4d1f91.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d46f0cccfddd9ae82cad952d55894cc57e28a545cc52056fe3b46044a28b947 +size 135981 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bd34f9d8-da46-4e2a-8f7b-dcf923a37737.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bd34f9d8-da46-4e2a-8f7b-dcf923a37737.lance new file mode 100644 index 0000000000000000000000000000000000000000..4bf5acc49c68f0e1deeecb7cf474b803676d3703 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/bd34f9d8-da46-4e2a-8f7b-dcf923a37737.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdab77ab184f5f2f6b8b2160d05d76ff625022fa1ea0096af2ecc44e3d07457e +size 137281 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bf6eee0a-2df1-4d0c-ae31-cb0f82c9b2dc.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bf6eee0a-2df1-4d0c-ae31-cb0f82c9b2dc.lance new file mode 100644 index 0000000000000000000000000000000000000000..7d3c5137ed13b5ff9dbd0255264e9a369e3d2e11 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/bf6eee0a-2df1-4d0c-ae31-cb0f82c9b2dc.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12c2c4e54c0a700adaef4bd771205c2580005358bc0a58e8e6a4c16ef7951cb1 +size 142033 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/bfe9c6a4-a2f7-4c47-be19-3b7a262602e0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/bfe9c6a4-a2f7-4c47-be19-3b7a262602e0.lance new file mode 100644 index 0000000000000000000000000000000000000000..86afba80cb0d018cc0cf914aa3a2f35cb69a6f02 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/bfe9c6a4-a2f7-4c47-be19-3b7a262602e0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aefa1478fc05c1070c9eef8ea641e212a7e4ab4ed3ad84d1db01c5701a456620 +size 138928 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c042cb84-398c-4217-b821-d9c263d14cd2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c042cb84-398c-4217-b821-d9c263d14cd2.lance new file mode 100644 index 0000000000000000000000000000000000000000..2af655a42b746a3e6bac98e1d2fdde79c12c853f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c042cb84-398c-4217-b821-d9c263d14cd2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f0333bfebcda2ff63af9134e4d598c949da4763a3e34e7665babad4f99059a8 +size 136954 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c0d34b85-7084-4b65-8fcd-8ae9b6afbb34.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c0d34b85-7084-4b65-8fcd-8ae9b6afbb34.lance new file mode 100644 index 0000000000000000000000000000000000000000..de00a868f03bc33f3903ec83fe5afe5b1422a120 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c0d34b85-7084-4b65-8fcd-8ae9b6afbb34.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2e4e7d6319deb89e632f8b36f40190f19aebb20b78942204c622f3e1a31e956 +size 138526 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c12b3ac1-63f1-4e77-a29a-4f8e4aca62f8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c12b3ac1-63f1-4e77-a29a-4f8e4aca62f8.lance new file mode 100644 index 0000000000000000000000000000000000000000..0be21716dce6cf0cf1ae88024186604b425f4504 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c12b3ac1-63f1-4e77-a29a-4f8e4aca62f8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0869b6f7ee8c0b108526a38fdea08810da2a7e1fb3376951e31e059c66ce3913 +size 137965 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c216912a-ee82-48cb-acfc-c2b06e1c0090.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c216912a-ee82-48cb-acfc-c2b06e1c0090.lance new file mode 100644 index 0000000000000000000000000000000000000000..f5e05622b34653122acd1cce79ad59ce2a1c773d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c216912a-ee82-48cb-acfc-c2b06e1c0090.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d36498904fe70d28c06f06c21b7d74e5b1e3736739a9b6eed1f2aa323d6cc55 +size 138138 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/c35f69de-891a-49b1-862a-5a4de24cbd0d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c35f69de-891a-49b1-862a-5a4de24cbd0d.lance new file mode 100644 index 0000000000000000000000000000000000000000..c4000ee2753127777ac657531b80596dd6fdf56c --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c35f69de-891a-49b1-862a-5a4de24cbd0d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67699e62e8f66034f70756ed122a3c8aba51e2edf7de81ed82710f4c2a90caeb +size 136308 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c3a802c0-04e5-4857-8ee5-dea8d4d93667.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c3a802c0-04e5-4857-8ee5-dea8d4d93667.lance new file mode 100644 index 0000000000000000000000000000000000000000..3a55ebee48a62e4d81d44aedeeabfe600002581f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c3a802c0-04e5-4857-8ee5-dea8d4d93667.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75e527231bd74e880a9f744e8e40a78ff95d93face54a44f032c3f04ccfef814 +size 136309 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c404d763-32d5-4203-8048-f021d71fadef.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c404d763-32d5-4203-8048-f021d71fadef.lance new file mode 100644 index 0000000000000000000000000000000000000000..99d63f14062b566ced2fc45d29907c75cc4c0b65 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c404d763-32d5-4203-8048-f021d71fadef.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a64cb27710d7f6a05324f0c55f26e78770b6cc4f66582ca5b2197886964e0cf +size 138933 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c4501160-9934-460a-9630-ec9257361435.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c4501160-9934-460a-9630-ec9257361435.lance new file mode 100644 index 0000000000000000000000000000000000000000..b288a894c52e5ba70816bc88883c338bf88ccae5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c4501160-9934-460a-9630-ec9257361435.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abfb1059dc41d27164a418b617e90ac6289b1f0ce97daa76fd3e34d874dec69b +size 138449 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c49c1d3a-f818-432e-9dd3-334875d755b2.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c49c1d3a-f818-432e-9dd3-334875d755b2.lance new file mode 100644 index 0000000000000000000000000000000000000000..dc369bcdbd715623c294d703490ada809329bb86 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c49c1d3a-f818-432e-9dd3-334875d755b2.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b468400e7fb4e0a929f91841c717d8814b3dfd7353f893434703f4b79e4d0022 +size 137004 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c4bbea73-d9ac-480b-869d-8bdf2578428e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c4bbea73-d9ac-480b-869d-8bdf2578428e.lance new file mode 100644 index 0000000000000000000000000000000000000000..605fd61d8ea826fcd0ef93f699a247ef91b36b21 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c4bbea73-d9ac-480b-869d-8bdf2578428e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fa37c4ef4d71215c0c18cb82b6e2e0bcfdef0386eb2c6d234a75f356a185d12 +size 140955 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c57fa4b4-a062-466e-8797-f5744ba91108.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c57fa4b4-a062-466e-8797-f5744ba91108.lance new file mode 100644 index 0000000000000000000000000000000000000000..e25378d5b0b2a6a26b1fc52e3c773ee847daa300 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/c57fa4b4-a062-466e-8797-f5744ba91108.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362f379d632166b0d799b930830a170d71c7336564672797cc016f6a053635a9 +size 136890 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c8295105-aa31-4a9f-acbd-9afe1e414b55.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c8295105-aa31-4a9f-acbd-9afe1e414b55.lance new file mode 100644 index 0000000000000000000000000000000000000000..b1331403daf184d24be217c548d6eff621b374a3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c8295105-aa31-4a9f-acbd-9afe1e414b55.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb312d2c236bfb83cd6a999b548fef0cac55e9ebfd49396fc41d18447348fe20 +size 137728 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c89a4fac-479b-45b9-9979-18023d256580.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c89a4fac-479b-45b9-9979-18023d256580.lance new file mode 100644 index 0000000000000000000000000000000000000000..ca719fef44bdcb6f4a453296eb675bde1b7cbbf0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c89a4fac-479b-45b9-9979-18023d256580.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f7a5a2a08301683578f8785e3a0ca789d0ca149e68a2b69df3163814586c36 +size 139147 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c946add2-7b5d-42e6-8b10-dafb53a3e5c6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c946add2-7b5d-42e6-8b10-dafb53a3e5c6.lance new file mode 100644 index 0000000000000000000000000000000000000000..350ae52cfee9298089246c5549c46e505dfa1c78 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c946add2-7b5d-42e6-8b10-dafb53a3e5c6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1734a63506a7261223650cd8550f18c947ad98d42d91b4b515c79d39e18089 +size 137750 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/c9693cf9-ec97-4dd1-97e9-957a29ebbf9b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/c9693cf9-ec97-4dd1-97e9-957a29ebbf9b.lance new file mode 100644 index 0000000000000000000000000000000000000000..5941aea2ddc64bba2307aa4dbf77ea2c6cedb40e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/c9693cf9-ec97-4dd1-97e9-957a29ebbf9b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd708c1341725c7e9269dd19e9caeedbfb1780fffc3f901a69d2692b2c5aead7 +size 137780 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ca4882e4-a5a7-4100-b580-d5e5a1322340.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ca4882e4-a5a7-4100-b580-d5e5a1322340.lance new file mode 100644 index 0000000000000000000000000000000000000000..5d0231657dd2f259af6af276c5fd52c62eeab00f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ca4882e4-a5a7-4100-b580-d5e5a1322340.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bc0380a9c82b51f13f198d46a6497fcf3fa43b5dec5a040116bf0d59db5bde0 +size 137972 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ca736a0f-fd0d-4e71-b65d-ea9440a937c1.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ca736a0f-fd0d-4e71-b65d-ea9440a937c1.lance new file mode 100644 index 0000000000000000000000000000000000000000..0d9358b931fcae50d0afc8bf56519aa1fee2909d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ca736a0f-fd0d-4e71-b65d-ea9440a937c1.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30d6b21ba84377ec7b4c4aab8917a5a69fff094316a1aacefc813b56e89728d2 +size 137657 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/cadcae28-4cbd-472f-9b6e-922cc9a20201.lance b/.lancedb/nltk_chunking_BAAI.lance/data/cadcae28-4cbd-472f-9b6e-922cc9a20201.lance new file mode 100644 index 0000000000000000000000000000000000000000..57b096e3d087837f3340b985267a8c300e083736 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/cadcae28-4cbd-472f-9b6e-922cc9a20201.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e652c1213976b019ff869a943dcbb6c6b0ed309ddaf0d144cbaee2f5b293614 +size 139849 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/cc89a78e-769e-4e7d-9d5d-272f71f32573.lance b/.lancedb/nltk_chunking_BAAI.lance/data/cc89a78e-769e-4e7d-9d5d-272f71f32573.lance new file mode 100644 index 0000000000000000000000000000000000000000..8627e0c4f1b310ea85d8b36f7b3dc2d3c89e63a8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/cc89a78e-769e-4e7d-9d5d-272f71f32573.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd2655e3e73771f068f0fdd0798b08ba2e6f3e49f2955ef0655b47bf8fd8028f +size 135873 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/cca63015-f2e4-4114-b6c4-81039c71b5d8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/cca63015-f2e4-4114-b6c4-81039c71b5d8.lance new file mode 100644 index 0000000000000000000000000000000000000000..392cac0c2fd498e8ba2a079d42da835df86e5c89 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/cca63015-f2e4-4114-b6c4-81039c71b5d8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:708b983c996b1c157f2cca6340215868025ad61126e4842e40bb0d3352b415e8 +size 143711 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ccf431ce-e7d1-4fc1-8371-a451c98ab3aa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ccf431ce-e7d1-4fc1-8371-a451c98ab3aa.lance new file mode 100644 index 0000000000000000000000000000000000000000..03746e3cb8bccd93c3468b04e8d7563fbcb35e65 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ccf431ce-e7d1-4fc1-8371-a451c98ab3aa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1e55449ad29a5f5348f2e1ca3bb7c71ab5278b2e743c6d58db912dcc138e500 +size 138443 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/cd1e09c1-4f49-4f14-b201-060ffb2d1ffb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/cd1e09c1-4f49-4f14-b201-060ffb2d1ffb.lance new file mode 100644 index 0000000000000000000000000000000000000000..30c6b9ea74cb7a3b99d46eb3a67c2bb804635c4f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/cd1e09c1-4f49-4f14-b201-060ffb2d1ffb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:218d78a61bc8acc515ea28b0f99c79d3d0f8adc1ed01a859863218109a0ce65b +size 139754 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ce247c2a-aede-4408-86ee-d80e080a73d6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ce247c2a-aede-4408-86ee-d80e080a73d6.lance new file mode 100644 index 0000000000000000000000000000000000000000..261e2d3235ce1e6d7d35d99eea171666b2cf2167 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ce247c2a-aede-4408-86ee-d80e080a73d6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae82950778eb276493ce5519ef2eb29ea6385045b5f4c63d6f4f48c1977a23f4 +size 138731 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ce54d6a2-0750-4232-b42b-45cf52529e1b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ce54d6a2-0750-4232-b42b-45cf52529e1b.lance new file mode 100644 index 0000000000000000000000000000000000000000..142e4823e8050d8e1c02654c81fd29f49e339334 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/ce54d6a2-0750-4232-b42b-45cf52529e1b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6542631cdc4a224b4aab2572d758df71ca8a0d51a1c92d93e914c355b17b3946 +size 136501 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ceb78039-978f-4ee5-8926-65a5382bb5be.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ceb78039-978f-4ee5-8926-65a5382bb5be.lance new file mode 100644 index 0000000000000000000000000000000000000000..b3dff23e3d984db640c1a536b21df8984c59e1f1 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ceb78039-978f-4ee5-8926-65a5382bb5be.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e4cba40fb81d6771b64f6795ec3a8eea2699fc56955a6b68876a37c2361e416 +size 137947 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/cf2f2e60-f0b9-476f-a1d6-f4ab7e108d50.lance b/.lancedb/nltk_chunking_BAAI.lance/data/cf2f2e60-f0b9-476f-a1d6-f4ab7e108d50.lance new file mode 100644 index 0000000000000000000000000000000000000000..3a73e1a301b6b5335311f2eb8f11af5ffeea204d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/cf2f2e60-f0b9-476f-a1d6-f4ab7e108d50.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32e4d8489d5fe95a4017bff075782dd3d121acd1ed2c03fe6bf36ad79114774e +size 136852 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d0b5d962-71e7-49e9-95af-ec3018b60b61.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d0b5d962-71e7-49e9-95af-ec3018b60b61.lance new file mode 100644 index 0000000000000000000000000000000000000000..7376f8736d90a0f42ba144b2d18458edbebd5e05 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d0b5d962-71e7-49e9-95af-ec3018b60b61.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:807615f56765e03590f3e828177a4ea01997c58b34f72030f8f6ec39822b90e2 +size 137076 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d1184237-d921-4d34-9e92-20f8f2393935.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d1184237-d921-4d34-9e92-20f8f2393935.lance new file mode 100644 index 0000000000000000000000000000000000000000..e8c9150157d656588c2bacb43bb4cab89c8fd5b3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d1184237-d921-4d34-9e92-20f8f2393935.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9efc31f0beeb1891f6f5480de3367b4cc99a1823ad0cc7d91ea400dc265f48e1 +size 137558 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d1af5185-61b0-4719-abea-92de165b9808.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d1af5185-61b0-4719-abea-92de165b9808.lance new file mode 100644 index 0000000000000000000000000000000000000000..31b0d158d060f1b29ea08c2dfea304833297cbaf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d1af5185-61b0-4719-abea-92de165b9808.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6db9f74493e83b6bc0f849c706b12b2a96e1b45f37d29d50fb088bab7be8fcf +size 137586 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d4e2a906-fce1-45b0-87de-6d6730f15a85.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d4e2a906-fce1-45b0-87de-6d6730f15a85.lance new file mode 100644 index 0000000000000000000000000000000000000000..ffd9651e64457497cbf25938f042acc95d76e6c4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d4e2a906-fce1-45b0-87de-6d6730f15a85.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2c3fdd6b33681155cde01f29272d20394c7ec9de073d71966e9278269e5bd6c +size 136368 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/d5de3413-8435-4376-98f7-e3e4f565dc70.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d5de3413-8435-4376-98f7-e3e4f565dc70.lance new file mode 100644 index 0000000000000000000000000000000000000000..0e7d0168ec44b922a4b930dade74693a2ed25cd4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d5de3413-8435-4376-98f7-e3e4f565dc70.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cae3216e2e1a5d23e19546c7f18420a69907d7cc20db7bec7f839e3f7715a18 +size 137829 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d5f5f79b-874b-40e6-8c70-5a50f0eb3167.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d5f5f79b-874b-40e6-8c70-5a50f0eb3167.lance new file mode 100644 index 0000000000000000000000000000000000000000..a27a3e74be7871e1800e53a6fd6bec8844921c7e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d5f5f79b-874b-40e6-8c70-5a50f0eb3167.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dba9486aabc7ee7a57c4a90e7e16b1a8e1943fb125f9bc50a0ed770b8d13bcc +size 138543 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d7a2c8b5-d318-4637-a58a-51d3edab8f45.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d7a2c8b5-d318-4637-a58a-51d3edab8f45.lance new file mode 100644 index 0000000000000000000000000000000000000000..1b26455cd4240e7d0fb313fd8519d3cb5f9ed14e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d7a2c8b5-d318-4637-a58a-51d3edab8f45.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16e04a6df278c1f2ebc6cd5121ee90e70c3c670e726163207592f28a773b5f0b +size 138162 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d7d6cb71-7105-4b9e-a335-84e292a16a7a.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d7d6cb71-7105-4b9e-a335-84e292a16a7a.lance new file mode 100644 index 0000000000000000000000000000000000000000..e86aff009f7ee44dd8090e0fd9ddee753c40aee6 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d7d6cb71-7105-4b9e-a335-84e292a16a7a.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0520234caddd5913241d4906de079baedeed8ac30b99f4bb48ad51f4052d5683 +size 135779 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d812d58b-485b-4b30-9ac8-eb8b4674eff8.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d812d58b-485b-4b30-9ac8-eb8b4674eff8.lance new file mode 100644 index 0000000000000000000000000000000000000000..e438c3ed999db5c41ab8c912de9ecb9066721399 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d812d58b-485b-4b30-9ac8-eb8b4674eff8.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ceb667cd968ab5fd8f2f9221395a8db90ea4f6c15e997b953628add8af67ff7 +size 137934 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d89ff3df-4a89-42a6-be9d-ab2efac5606c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d89ff3df-4a89-42a6-be9d-ab2efac5606c.lance new file mode 100644 index 0000000000000000000000000000000000000000..56c0d8665445cc436a63e5d3092841288563b019 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/d89ff3df-4a89-42a6-be9d-ab2efac5606c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09b1cf649b15f1cc6fd186f3aec087a16566ce2fd708ceae61b863ffb9e6848b +size 135678 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/d90beca8-ac7a-49ca-8ee6-98c2008fc4c7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/d90beca8-ac7a-49ca-8ee6-98c2008fc4c7.lance new file mode 100644 index 0000000000000000000000000000000000000000..78507174ea0e2ca617d20880f3696faff9d2fb6a --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/d90beca8-ac7a-49ca-8ee6-98c2008fc4c7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b2f17ae2b1ff00829970c72dbb66d55eca438c50c0d43a2fb730d6df4086238 +size 135283 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/da886579-3223-42cd-a00e-456f1921c950.lance b/.lancedb/nltk_chunking_BAAI.lance/data/da886579-3223-42cd-a00e-456f1921c950.lance new file mode 100644 index 0000000000000000000000000000000000000000..28534e939ad133fc7a0def84ab6adc4f93a8f0d2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/da886579-3223-42cd-a00e-456f1921c950.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:237bd98c4bf320140686d8fc1e2c226dae87fd814594a506e53eff4560aeca71 +size 137580 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/da8efe49-60f2-44f1-b821-dedf5ad91660.lance b/.lancedb/nltk_chunking_BAAI.lance/data/da8efe49-60f2-44f1-b821-dedf5ad91660.lance new file mode 100644 index 0000000000000000000000000000000000000000..9f81ec3dd5e35d52b8a229e660dca91b30e2d336 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/da8efe49-60f2-44f1-b821-dedf5ad91660.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dfaaf308413323aacb93eef1bcc43c4940c2b995b44b27534217dfcaf080462 +size 138876 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/da934ea6-2a4f-4ba0-9b73-c42e591b5ad4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/da934ea6-2a4f-4ba0-9b73-c42e591b5ad4.lance new file mode 100644 index 0000000000000000000000000000000000000000..c8e935aea293a77a3aab1c3f6f224d64a4770d2d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/da934ea6-2a4f-4ba0-9b73-c42e591b5ad4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b04183f34be56f8746f80324b1d2bd2976bb016ca83d25393d69f8084326561f +size 139619 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/db1243f7-f6ba-4476-898f-ef99c0272b3e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/db1243f7-f6ba-4476-898f-ef99c0272b3e.lance new file mode 100644 index 0000000000000000000000000000000000000000..ec10e771fccb488bd95ed956a980a80b717a7c06 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/db1243f7-f6ba-4476-898f-ef99c0272b3e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dcdbc2d3432f931004ce4dcaf01cb2456348bc2c16fc9497fb343f096d2d4f3 +size 137594 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/dc9701fe-6534-4580-85e4-990d0a7228aa.lance b/.lancedb/nltk_chunking_BAAI.lance/data/dc9701fe-6534-4580-85e4-990d0a7228aa.lance new file mode 100644 index 0000000000000000000000000000000000000000..70f2da67741189d895f194196de4f1375d72cadd --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/dc9701fe-6534-4580-85e4-990d0a7228aa.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:670b2cf8578fddd49550240e5d4d3b0f86fe21a5d81d068ee133cda0774430b3 +size 136815 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/dd394e52-43d9-45e0-8642-074cd9763bbd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/dd394e52-43d9-45e0-8642-074cd9763bbd.lance new file mode 100644 index 0000000000000000000000000000000000000000..8a7742da5cc041d16c17ab22bb6bf96ecc447a9b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/dd394e52-43d9-45e0-8642-074cd9763bbd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1f9bb1df83f0b648a8008c445cf070da6ef23a66c30a22e8c173056719a553a +size 144695 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/dd8af1ca-6cd1-4b30-81ac-10b8950c9d1f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/dd8af1ca-6cd1-4b30-81ac-10b8950c9d1f.lance new file mode 100644 index 0000000000000000000000000000000000000000..4e102d95d1fe9f89016bb46c3a5d2d7b59e08682 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/dd8af1ca-6cd1-4b30-81ac-10b8950c9d1f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05d2a4976cda8d46412d6eab3e3923880612ec22ea723a36ec95c0fb1c2f1f98 +size 140346 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/de40221a-61c5-4e92-985a-b836b5c8e020.lance b/.lancedb/nltk_chunking_BAAI.lance/data/de40221a-61c5-4e92-985a-b836b5c8e020.lance new file mode 100644 index 0000000000000000000000000000000000000000..033f20c032c7653938b1f94d26196606ad987879 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/de40221a-61c5-4e92-985a-b836b5c8e020.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44b4703ac59f1c05db859dec038683d18d6af0f84a196eabc8e2cabc519440e8 +size 137626 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/df7de426-d18d-4775-92f5-4b37a4005611.lance b/.lancedb/nltk_chunking_BAAI.lance/data/df7de426-d18d-4775-92f5-4b37a4005611.lance new file mode 100644 index 0000000000000000000000000000000000000000..3b367d080236fe0a4c48182d004a058bcb59862a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/df7de426-d18d-4775-92f5-4b37a4005611.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa3a0d431d9d4b5991ebb4782a8110a4ad38dd48c79cdc25c964829b6d51af19 +size 136182 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e04def6f-3473-41e6-b0ff-1a685844c908.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e04def6f-3473-41e6-b0ff-1a685844c908.lance new file mode 100644 index 0000000000000000000000000000000000000000..fd838e5f11758c05f565eea2876c25b0cff01988 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e04def6f-3473-41e6-b0ff-1a685844c908.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a411b4b608d15864f3fb83ef8b6665feb6127f85b59d2d1210cfb866fb62971 +size 137588 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e0aac18d-9ca3-4522-989d-6828abe4244c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e0aac18d-9ca3-4522-989d-6828abe4244c.lance new file mode 100644 index 0000000000000000000000000000000000000000..7936c118b7eca13ff51d5b3e682dbd7cd6851f78 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e0aac18d-9ca3-4522-989d-6828abe4244c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c22d891bc6b2103ce5075f00ff009506074238255cbe9640ff4b473e9695da3f +size 137400 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e1729164-136f-4e28-a170-6777092ad175.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e1729164-136f-4e28-a170-6777092ad175.lance new file mode 100644 index 0000000000000000000000000000000000000000..eeb0ececace57c9461bdbe4b5b4498a90812b96f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e1729164-136f-4e28-a170-6777092ad175.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b5a87d1e6de3853c8e02bbefb7d7435fc4a8b1aed400fbdb34247e835c3f53b +size 136489 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e43e1599-1f35-4e9f-8e3b-8566c82904da.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e43e1599-1f35-4e9f-8e3b-8566c82904da.lance new file mode 100644 index 0000000000000000000000000000000000000000..11a5930ec40f3316146e7126166efd98ed0d72d5 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/e43e1599-1f35-4e9f-8e3b-8566c82904da.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9dd756bf147982da6798f1a1d18bf52ea3d73cf3dd9bca8f6fb10fc8383760a +size 138656 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e4cf5e5b-9853-4321-a934-bf2f6ffd720c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e4cf5e5b-9853-4321-a934-bf2f6ffd720c.lance new file mode 100644 index 0000000000000000000000000000000000000000..c10b1ec4c6f8c6ebce0d92f2d3d0a398bf49dbb7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e4cf5e5b-9853-4321-a934-bf2f6ffd720c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5675856ab4856b0bc0819e3587efe7f13093938dd75aa936961fb0c64316f63b +size 137426 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e4effecc-9d2b-41f3-b707-fb6612df2182.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e4effecc-9d2b-41f3-b707-fb6612df2182.lance new file mode 100644 index 0000000000000000000000000000000000000000..6c21101f210afd20a5e0ae49e563c8d408844102 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e4effecc-9d2b-41f3-b707-fb6612df2182.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2949ee7d086cf35cd6e736e133485693bd537ec1130f29a05ebe0693d71146b1 +size 140645 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e51b555e-9cec-4733-8192-f55e381ffd3f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e51b555e-9cec-4733-8192-f55e381ffd3f.lance new file mode 100644 index 0000000000000000000000000000000000000000..268546f6cc0f56b337d67c73dd02496b666c7960 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e51b555e-9cec-4733-8192-f55e381ffd3f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db070113509decba1dd284f4bfebe5db849b1ce0b18e77360e15cf3a9bfc4d45 +size 138293 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e5428f17-c89a-4fc2-8a7e-88aac7181a43.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e5428f17-c89a-4fc2-8a7e-88aac7181a43.lance new file mode 100644 index 0000000000000000000000000000000000000000..9b9497c81778fc6aff87a9218933fc7ec6ff4873 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e5428f17-c89a-4fc2-8a7e-88aac7181a43.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f8c7400cc47bec28e7ff546c9e67cc45b6cd06a04a9f24c86e5a4ff04a666f +size 136783 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e60c0fba-9322-485e-9a29-354a3686bfeb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e60c0fba-9322-485e-9a29-354a3686bfeb.lance new file mode 100644 index 0000000000000000000000000000000000000000..d10d6f36e22f99a72602fdec94cad85862b2fbf5 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e60c0fba-9322-485e-9a29-354a3686bfeb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99938c0877c06232404f0c046f5fd22462757ce87220a29cbb01455e267e9085 +size 136844 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/e6586716-ce33-4368-83e5-7a482d4abb02.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e6586716-ce33-4368-83e5-7a482d4abb02.lance new file mode 100644 index 0000000000000000000000000000000000000000..8a130514fb14035bc4d37c83d8b4fcbd1e629669 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e6586716-ce33-4368-83e5-7a482d4abb02.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c53c2aaff08a97cbfd3831bc1e8dce3d99801164af6c061029a95583eb4ffb +size 136218 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/e700564f-2869-4cee-86a8-0612a812b7f4.lance b/.lancedb/nltk_chunking_BAAI.lance/data/e700564f-2869-4cee-86a8-0612a812b7f4.lance new file mode 100644 index 0000000000000000000000000000000000000000..5b365ded936f83ba52d860172a10d42a1c02f07a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/e700564f-2869-4cee-86a8-0612a812b7f4.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe725269eac06136bed4d4b047ba58e095785762ba697d26756561514630512 +size 139308 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ea847b93-4d85-4c63-b7bb-5561937738dd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ea847b93-4d85-4c63-b7bb-5561937738dd.lance new file mode 100644 index 0000000000000000000000000000000000000000..7c1663e35385cbc564f31184344bb5a51a93c50f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ea847b93-4d85-4c63-b7bb-5561937738dd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81d0fcab22b88712482b4d90c52662c32385651899a836fadeab91776b1728c0 +size 135332 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/eaf84a43-b4ee-4a1a-a54b-afe3f8b06e5e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/eaf84a43-b4ee-4a1a-a54b-afe3f8b06e5e.lance new file mode 100644 index 0000000000000000000000000000000000000000..565398090a48c74f959f076168db4d17a8d2c300 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/eaf84a43-b4ee-4a1a-a54b-afe3f8b06e5e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1705705a7813282d97649424199cc99bafa4a60cb52bbb16ea4aa5b35d07a662 +size 137174 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ed42e4d5-5d0e-46d1-a834-42c7c85663c6.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ed42e4d5-5d0e-46d1-a834-42c7c85663c6.lance new file mode 100644 index 0000000000000000000000000000000000000000..92f99355088275f7d0e5805ba15791730afb03c0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ed42e4d5-5d0e-46d1-a834-42c7c85663c6.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3a402c4a7b87cde13eaf0bbf76175534cb166934d63f2b2b5fcb25cf1fdd74d +size 139009 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ed7445c0-227a-4a2c-9755-dd992f039289.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ed7445c0-227a-4a2c-9755-dd992f039289.lance new file mode 100644 index 0000000000000000000000000000000000000000..bd3219fbc5bb14c8767130726092e58e42b5d5c4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ed7445c0-227a-4a2c-9755-dd992f039289.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:857929c9048fc214aaa80e41877277a80124ccbc791289cb6b00c88ae0938bd1 +size 137308 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ee5c3d9a-092d-489b-abbf-7003963b14fe.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ee5c3d9a-092d-489b-abbf-7003963b14fe.lance new file mode 100644 index 0000000000000000000000000000000000000000..846a2ce563d6854502afc034391bb12b5b1335b3 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ee5c3d9a-092d-489b-abbf-7003963b14fe.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8732ccb12d6f9e8c5adffbc2869f803f666c4e86d5e8a6dc8592644b7b2743 +size 136689 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/eed24429-a8ec-4156-b138-bf4f3e3ac24b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/eed24429-a8ec-4156-b138-bf4f3e3ac24b.lance new file mode 100644 index 0000000000000000000000000000000000000000..be7a33071fcaec37feab77da04da1f809affe971 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/eed24429-a8ec-4156-b138-bf4f3e3ac24b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:092033ec84a521da305c3f7f3356e860b66173b0a3c6dd0ffd99715e82a506f3 +size 136527 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ef832df4-26cf-473a-9076-69d214af4b46.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ef832df4-26cf-473a-9076-69d214af4b46.lance new file mode 100644 index 0000000000000000000000000000000000000000..31b8500f1839f22bb9d0aa6b2c015e2d8f1cb61e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ef832df4-26cf-473a-9076-69d214af4b46.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6a04c73ac807fdf8e324ea95b65aec23cbdd020a18b0e59bfea65bda899e81 +size 136738 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/efd9a2e7-9ce3-41ab-b5b7-82134633ad3d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/efd9a2e7-9ce3-41ab-b5b7-82134633ad3d.lance new file mode 100644 index 0000000000000000000000000000000000000000..341cd86ff62fcc105102d915cd0bf24c3e58812b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/efd9a2e7-9ce3-41ab-b5b7-82134633ad3d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7cc84bae6ee8e63e276df338698197d2c216d13bbe8c45cb73a07ea6e4c792a +size 137515 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/efe38025-950c-4f7b-a89c-d019b512f10d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/efe38025-950c-4f7b-a89c-d019b512f10d.lance new file mode 100644 index 0000000000000000000000000000000000000000..3e9beff116103e9c1c5a66645ff67fde2a5e8d4a --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/efe38025-950c-4f7b-a89c-d019b512f10d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e37219796a6ec809f165c468b1ac62adb2f466fc3abb6ef30b2a61185645953 +size 138312 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f0f790ef-06a0-47d2-94b0-d5962162372c.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f0f790ef-06a0-47d2-94b0-d5962162372c.lance new file mode 100644 index 0000000000000000000000000000000000000000..fcc4c09f4089f3a7fff9a2cbe01c9310f89024a2 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f0f790ef-06a0-47d2-94b0-d5962162372c.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bda9bb44b8b3d15371e677c47a4c0c4eb4356920d4eae4c54ab91759586607c2 +size 137840 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f11df924-f5e0-4e0c-afda-e9de9e9eaf63.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f11df924-f5e0-4e0c-afda-e9de9e9eaf63.lance new file mode 100644 index 0000000000000000000000000000000000000000..016ca66c647d84ca2f8bbd3b5235a2ea35fba0ff --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f11df924-f5e0-4e0c-afda-e9de9e9eaf63.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6593905842ffb11909494d8197326a4e998c032da8f931be54e59c792237204f +size 135714 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f1661108-9ef7-4440-8c1c-a3c3a5de314e.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f1661108-9ef7-4440-8c1c-a3c3a5de314e.lance new file mode 100644 index 0000000000000000000000000000000000000000..5c5e1c8b73a1113c73ad5d94e35278388286895e --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f1661108-9ef7-4440-8c1c-a3c3a5de314e.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9266fa6910a3268761b4626d803aa88cd025e53f59c59a8be0a3017e5bca8a8d +size 138238 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/f1a1ca02-b133-4564-8d78-ae0b30383626.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f1a1ca02-b133-4564-8d78-ae0b30383626.lance new file mode 100644 index 0000000000000000000000000000000000000000..1ffedc7973eb0d8505464c41138dabe6837c2760 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f1a1ca02-b133-4564-8d78-ae0b30383626.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6601a61485b68fed921b1861534d7e13fb99e0c969309418375cf5ade7b479a8 +size 139377 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f1b90d01-b538-4444-9a95-a49bfc561f67.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f1b90d01-b538-4444-9a95-a49bfc561f67.lance new file mode 100644 index 0000000000000000000000000000000000000000..9568f3ef96044a255da3ce5fb154f928d6e330e7 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f1b90d01-b538-4444-9a95-a49bfc561f67.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03005fb0016a32ca8fd94e527961b96d106fbf001279672925e6f0b629552b53 +size 139412 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f22dfde6-ffea-46fe-8736-c518512e4057.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f22dfde6-ffea-46fe-8736-c518512e4057.lance new file mode 100644 index 0000000000000000000000000000000000000000..b01d58b9267e9d5138e8814412055213a38403a0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f22dfde6-ffea-46fe-8736-c518512e4057.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e94f4ff14d22d21f29213204d48a52c19ccaacbc2b01114c5f13d18f326f6cb +size 137071 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f2892adf-afd6-41c1-8c6f-4f6a19f3f70d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f2892adf-afd6-41c1-8c6f-4f6a19f3f70d.lance new file mode 100644 index 0000000000000000000000000000000000000000..aeb50115882d02cedf6f9403e05370bc396737bf --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f2892adf-afd6-41c1-8c6f-4f6a19f3f70d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62a86b92f059cad2d04f042aedad09dde67c212cc27664130c9077bca6cead98 +size 137770 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f3cb97b7-aacb-4223-ad7e-96f8b6fd900f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f3cb97b7-aacb-4223-ad7e-96f8b6fd900f.lance new file mode 100644 index 0000000000000000000000000000000000000000..84f2d583426a4d96a5a0143b7a4e75dd29c5110b --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f3cb97b7-aacb-4223-ad7e-96f8b6fd900f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b721db414ad302cc5cbb6e16fd4e35958a1a403fea18fea407091f06c1f54704 +size 136005 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f4904a3c-f5a9-4910-9649-43b4d71e03c9.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f4904a3c-f5a9-4910-9649-43b4d71e03c9.lance new file mode 100644 index 0000000000000000000000000000000000000000..8741914516c7496fd0be23a8b12a77cb2ad5b716 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f4904a3c-f5a9-4910-9649-43b4d71e03c9.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9f1d9d92ae12d60a0f73c1f74feeb9c2ea773bf8b254446c1b21ef1aa211855 +size 134807 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f52b284d-dbf4-4f65-9075-4b9b40d8484d.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f52b284d-dbf4-4f65-9075-4b9b40d8484d.lance new file mode 100644 index 0000000000000000000000000000000000000000..a39f0f294ae0ef3c0317a194bf13e1c3cdf54e3a --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/f52b284d-dbf4-4f65-9075-4b9b40d8484d.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6067f68528cf70f71a13759147dafc1673a367219517b8735d0dd08170e9cda3 +size 137272 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f6d76da0-4730-49e5-8db0-c9a4fddd36bd.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f6d76da0-4730-49e5-8db0-c9a4fddd36bd.lance new file mode 100644 index 0000000000000000000000000000000000000000..de00b523ee78fa7a976addbd1a8bf97aae3d1849 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f6d76da0-4730-49e5-8db0-c9a4fddd36bd.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5befbde3d2f5903deb3260bfa5e63e940321daf7b64dd938176a9c6f7ae58f47 +size 137531 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f6da5e94-fec0-484c-9814-32e0412dc9b7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f6da5e94-fec0-484c-9814-32e0412dc9b7.lance new file mode 100644 index 0000000000000000000000000000000000000000..11ef5ecb8bd09cbda36ea8cebe333b49cfaa8436 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f6da5e94-fec0-484c-9814-32e0412dc9b7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c88c696f217fd139fe05ec28b278f587b3c5c1d6a25acd0d80c0aada9b66a23 +size 136547 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f6db4b28-9871-4ecd-b8cf-658b07219c7f.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f6db4b28-9871-4ecd-b8cf-658b07219c7f.lance new file mode 100644 index 0000000000000000000000000000000000000000..268e0e781b5a6802d0639cc1ae58b99b504b9898 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f6db4b28-9871-4ecd-b8cf-658b07219c7f.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bdb4e7c3aadafdedc8d9b03d38c8cce8a453fdabc3e27b46b89e58413809c88f +size 139316 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f73e5298-370a-4e2a-88de-87591bde366b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f73e5298-370a-4e2a-88de-87591bde366b.lance new file mode 100644 index 0000000000000000000000000000000000000000..cad865ca5ae82b4ca6f7c0514d463f4347f041c0 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f73e5298-370a-4e2a-88de-87591bde366b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:986d8db028535e5aa4b3d80fbe380bf17d59a900675baddc1125ed4818a27774 +size 138572 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f75acf97-eb74-47a2-ae13-2840b1941c12.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f75acf97-eb74-47a2-ae13-2840b1941c12.lance new file mode 100644 index 0000000000000000000000000000000000000000..15370a553af18a37736afc61d09688fd57083d43 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f75acf97-eb74-47a2-ae13-2840b1941c12.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:335c7598af527ebfe65cf103a3b0170d71d318a9b96ae8c1741c9e327ce7d1ea +size 140085 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f7946c66-eef8-419d-b1db-a51cedd88bb3.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f7946c66-eef8-419d-b1db-a51cedd88bb3.lance new file mode 100644 index 0000000000000000000000000000000000000000..d573f54e36fa2ac60789f8d44f89f20ed9209358 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f7946c66-eef8-419d-b1db-a51cedd88bb3.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d96f030d0472f80278d9814267f2ee0017ce789c83146a63e8b3c64e080d120 +size 140279 diff --git 
a/.lancedb/nltk_chunking_BAAI.lance/data/f7d1fa91-9ec0-40ca-a7f2-beabe770e1b5.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f7d1fa91-9ec0-40ca-a7f2-beabe770e1b5.lance new file mode 100644 index 0000000000000000000000000000000000000000..407c687c960dc64496da803cb32e1e3555c804d8 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f7d1fa91-9ec0-40ca-a7f2-beabe770e1b5.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a9ff43a07f80e94f774778e0e1e2903528be11c4992a160b6ccf0ee5d40656d +size 138596 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f8870d60-2d85-4831-8dc6-088aaac9a68b.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f8870d60-2d85-4831-8dc6-088aaac9a68b.lance new file mode 100644 index 0000000000000000000000000000000000000000..777ef45399862bbba5766a603b1367c99aa38d5f --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f8870d60-2d85-4831-8dc6-088aaac9a68b.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfdd6f3be5aa818c55cc2281293f65cda994fb46282578df512264ab299cc909 +size 135549 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/f89ccc14-e872-4cbb-bbe9-772484d02d72.lance b/.lancedb/nltk_chunking_BAAI.lance/data/f89ccc14-e872-4cbb-bbe9-772484d02d72.lance new file mode 100644 index 0000000000000000000000000000000000000000..526b9bbbc459b4a10f48ed10724946692ed1ef62 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/f89ccc14-e872-4cbb-bbe9-772484d02d72.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df70f139dabf26a0f4a7ecef06a0062389192d56b2b91b8724fa8d68fca6207a +size 140875 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/facefdcf-6d2d-48bd-b6bc-3d5a96f5e2f0.lance b/.lancedb/nltk_chunking_BAAI.lance/data/facefdcf-6d2d-48bd-b6bc-3d5a96f5e2f0.lance new file mode 100644 index 0000000000000000000000000000000000000000..4a0f2ed4866202e8004158e9c524cc5624328d75 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/facefdcf-6d2d-48bd-b6bc-3d5a96f5e2f0.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c64f5102dce01234b331b3d8c2f407b401817a5b287d88b8daf7c400f48ac47b +size 136500 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/fb5f8155-e080-4921-95bd-a0508f01e089.lance b/.lancedb/nltk_chunking_BAAI.lance/data/fb5f8155-e080-4921-95bd-a0508f01e089.lance new file mode 100644 index 0000000000000000000000000000000000000000..b934242b7d8a169bfb5ca12d5b1fb09c000117d4 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/fb5f8155-e080-4921-95bd-a0508f01e089.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fc58d7106c70dc88bf0f557fbd2b34c84da9dae0f2b2412ecdc1e73cbe9ee8c +size 135556 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/fbf63155-521b-4386-97a9-8a8964993765.lance b/.lancedb/nltk_chunking_BAAI.lance/data/fbf63155-521b-4386-97a9-8a8964993765.lance new file mode 100644 index 0000000000000000000000000000000000000000..b000f690e7d27c40d96f99117c89ecba81095e0d --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/fbf63155-521b-4386-97a9-8a8964993765.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63c5f1b406a15bc69017ed4426438af4684a714e56175bc79188ec2788e3f6fa +size 137626 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/fc4e526a-9aef-4cfd-84dd-87562502fabb.lance b/.lancedb/nltk_chunking_BAAI.lance/data/fc4e526a-9aef-4cfd-84dd-87562502fabb.lance new file mode 100644 index 0000000000000000000000000000000000000000..53c4642b7f00609654e1ca14b2d3700ddf7bc268 --- /dev/null +++ 
b/.lancedb/nltk_chunking_BAAI.lance/data/fc4e526a-9aef-4cfd-84dd-87562502fabb.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bce052605a5401f3e58748c6cd07c249c768a90c898d0e6fc568be22d363b14e +size 136022 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/fc8ef88c-c79d-4880-a5f5-97d4abf6bcb7.lance b/.lancedb/nltk_chunking_BAAI.lance/data/fc8ef88c-c79d-4880-a5f5-97d4abf6bcb7.lance new file mode 100644 index 0000000000000000000000000000000000000000..f186a55841bb45b0cd2cf813ef4168d061613535 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/fc8ef88c-c79d-4880-a5f5-97d4abf6bcb7.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38cdec454ed2d2cd317337a776ff1cb86f9a4635284bc1d1ee13c2abf7977d6a +size 139042 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/fefa55d0-3607-494f-afcb-e67a14f08c48.lance b/.lancedb/nltk_chunking_BAAI.lance/data/fefa55d0-3607-494f-afcb-e67a14f08c48.lance new file mode 100644 index 0000000000000000000000000000000000000000..7496a3941f96baaa64af692397846e3322f79594 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/fefa55d0-3607-494f-afcb-e67a14f08c48.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac6160f70012ab79e1c32dbaa5892adaf049081c7173e53e0591a585f0b6f83c +size 136648 diff --git a/.lancedb/nltk_chunking_BAAI.lance/data/ff6378c1-db1b-4a78-bbe6-04f54f41e811.lance b/.lancedb/nltk_chunking_BAAI.lance/data/ff6378c1-db1b-4a78-bbe6-04f54f41e811.lance new file mode 100644 index 0000000000000000000000000000000000000000..3efdf53eb08b998b5a00f4416d104f8f50d27608 --- /dev/null +++ b/.lancedb/nltk_chunking_BAAI.lance/data/ff6378c1-db1b-4a78-bbe6-04f54f41e811.lance @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0aaa886fc3a17b1cb15c99e15bf3cee3ca20411ac5919f19bf7ea8d099d2fec9 +size 137921 diff --git a/chunked/content_aware_chunking/__config/chunk_1.txt b/chunked/content_aware_chunking/__config/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..570746a8d53645c1a5f75632488aa36c318f718f --- /dev/null +++ b/chunked/content_aware_chunking/__config/chunk_1.txt @@ -0,0 +1,2 @@ +pip install transformers datasets +To install from source instead of the last release, comment the command above and uncomment the following one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/__config/chunk_2.txt b/chunked/content_aware_chunking/__config/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e0f12e3246e5d0b556558359a30e0991092cdc --- /dev/null +++ b/chunked/content_aware_chunking/__config/chunk_2.txt @@ -0,0 +1 @@ +! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/__config/chunk_3.txt b/chunked/content_aware_chunking/__config/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1c2b553afc68a2cfc0a086df93969ea53775ed0 --- /dev/null +++ b/chunked/content_aware_chunking/__config/chunk_3.txt @@ -0,0 +1,8 @@ +pip install git+https://github.com/huggingface/transformers.git +""" +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} \ No newline at end of file diff --git a/chunked/content_aware_chunking/__toctree/chunk_2.txt b/chunked/content_aware_chunking/__toctree/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..76209eea7470e02152e059347698b4328102f5ab --- /dev/null +++ b/chunked/content_aware_chunking/__toctree/chunk_2.txt @@ -0,0 +1,2 @@ +local: add_tensorflow_model + title: How to convert a 🤗 Transformers model to TensorFlow? \ No newline at end of file diff --git a/chunked/content_aware_chunking/__toctree/chunk_3.txt b/chunked/content_aware_chunking/__toctree/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ead8ca274b89f5fff27f905d7eb8d7c05c7b9aa --- /dev/null +++ b/chunked/content_aware_chunking/__toctree/chunk_3.txt @@ -0,0 +1,2 @@ +local: add_new_pipeline + title: How to add a pipeline to 🤗 Transformers? \ No newline at end of file diff --git a/chunked/content_aware_chunking/__toctree/chunk_4.txt b/chunked/content_aware_chunking/__toctree/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..7daf7c1d01b3104518e9a1ee6869ea21cf35d7d5 --- /dev/null +++ b/chunked/content_aware_chunking/__toctree/chunk_4.txt @@ -0,0 +1,637 @@ +local: testing + title: Testing +local: pr_checks + title: Checks on a Pull Request + title: Contribute +sections: +local: philosophy + title: Philosophy +local: glossary + title: Glossary +local: task_summary + title: What 🤗 Transformers can do +local: tasks_explained + title: How 🤗 Transformers solve tasks +local: model_summary + title: The Transformer model family +local: tokenizer_summary + title: Summary of the tokenizers +local: attention + title: Attention mechanisms +local: pad_truncation + title: Padding and truncation +local: bertology + title: BERTology +local: perplexity + title: Perplexity of fixed-length models +local: pipeline_webserver + title: Pipelines for webserver inference +local: model_memory_anatomy + title: Model training anatomy +local: llm_tutorial_optimization + title: Getting the most out of LLMs + title: Conceptual guides +sections: +sections: +local: main_classes/agent + title: Agents and Tools +local: model_doc/auto + title: Auto Classes +local: main_classes/backbones + title: Backbones +local: main_classes/callback + title: Callbacks +local: main_classes/configuration + title: Configuration +local: main_classes/data_collator + title: Data Collator +local: main_classes/keras_callbacks + title: Keras callbacks +local: main_classes/logging + title: Logging +local: main_classes/model + title: Models +local: main_classes/text_generation + title: Text Generation +local: main_classes/onnx + title: ONNX +local: main_classes/optimizer_schedules + title: Optimization +local: main_classes/output + title: Model outputs +local: main_classes/pipelines + title: Pipelines +local: main_classes/processors + title: Processors +local: main_classes/quantization + 
title: Quantization +local: main_classes/tokenizer + title: Tokenizer +local: main_classes/trainer + title: Trainer +local: main_classes/deepspeed + title: DeepSpeed +local: main_classes/feature_extractor + title: Feature Extractor +local: main_classes/image_processor + title: Image Processor +title: Main Classes + +sections: +isExpanded: false + sections: +local: model_doc/albert + title: ALBERT +local: model_doc/bart + title: BART +local: model_doc/barthez + title: BARThez +local: model_doc/bartpho + title: BARTpho +local: model_doc/bert + title: BERT +local: model_doc/bert-generation + title: BertGeneration +local: model_doc/bert-japanese + title: BertJapanese +local: model_doc/bertweet + title: Bertweet +local: model_doc/big_bird + title: BigBird +local: model_doc/bigbird_pegasus + title: BigBirdPegasus +local: model_doc/biogpt + title: BioGpt +local: model_doc/blenderbot + title: Blenderbot +local: model_doc/blenderbot-small + title: Blenderbot Small +local: model_doc/bloom + title: BLOOM +local: model_doc/bort + title: BORT +local: model_doc/byt5 + title: ByT5 +local: model_doc/camembert + title: CamemBERT +local: model_doc/canine + title: CANINE +local: model_doc/codegen + title: CodeGen +local: model_doc/code_llama + title: CodeLlama +local: model_doc/convbert + title: ConvBERT +local: model_doc/cpm + title: CPM +local: model_doc/cpmant + title: CPMANT +local: model_doc/ctrl + title: CTRL +local: model_doc/deberta + title: DeBERTa +local: model_doc/deberta-v2 + title: DeBERTa-v2 +local: model_doc/dialogpt + title: DialoGPT +local: model_doc/distilbert + title: DistilBERT +local: model_doc/dpr + title: DPR +local: model_doc/electra + title: ELECTRA +local: model_doc/encoder-decoder + title: Encoder Decoder Models +local: model_doc/ernie + title: ERNIE +local: model_doc/ernie_m + title: ErnieM +local: model_doc/esm + title: ESM +local: model_doc/falcon + title: Falcon +local: model_doc/fastspeech2_conformer + title: FastSpeech2Conformer +local: model_doc/flan-t5 + title: FLAN-T5 +local: model_doc/flan-ul2 + title: FLAN-UL2 +local: model_doc/flaubert + title: FlauBERT +local: model_doc/fnet + title: FNet +local: model_doc/fsmt + title: FSMT +local: model_doc/funnel + title: Funnel Transformer +local: model_doc/fuyu + title: Fuyu +local: model_doc/openai-gpt + title: GPT +local: model_doc/gpt_neo + title: GPT Neo +local: model_doc/gpt_neox + title: GPT NeoX +local: model_doc/gpt_neox_japanese + title: GPT NeoX Japanese +local: model_doc/gptj + title: GPT-J +local: model_doc/gpt2 + title: GPT2 +local: model_doc/gpt_bigcode + title: GPTBigCode +local: model_doc/gptsan-japanese + title: GPTSAN Japanese +local: model_doc/gpt-sw3 + title: GPTSw3 +local: model_doc/herbert + title: HerBERT +local: model_doc/ibert + title: I-BERT +local: model_doc/jukebox + title: Jukebox +local: model_doc/led + title: LED +local: model_doc/llama + title: LLaMA +local: model_doc/llama2 + title: Llama2 +local: model_doc/longformer + title: Longformer +local: model_doc/longt5 + title: LongT5 +local: model_doc/luke + title: LUKE +local: model_doc/m2m_100 + title: M2M100 +local: model_doc/madlad-400 + title: MADLAD-400 +local: model_doc/marian + title: MarianMT +local: model_doc/markuplm + title: MarkupLM +local: model_doc/mbart + title: MBart and MBart-50 +local: model_doc/mega + title: MEGA +local: model_doc/megatron-bert + title: MegatronBERT +local: model_doc/megatron_gpt2 + title: MegatronGPT2 +local: model_doc/mistral + title: Mistral +local: model_doc/mixtral + title: Mixtral +local: model_doc/mluke + 
title: mLUKE +local: model_doc/mobilebert + title: MobileBERT +local: model_doc/mpnet + title: MPNet +local: model_doc/mpt + title: MPT +local: model_doc/mra + title: MRA +local: model_doc/mt5 + title: MT5 +local: model_doc/mvp + title: MVP +local: model_doc/nezha + title: NEZHA +local: model_doc/nllb + title: NLLB +local: model_doc/nllb-moe + title: NLLB-MoE +local: model_doc/nystromformer + title: Nyströmformer +local: model_doc/open-llama + title: Open-Llama +local: model_doc/opt + title: OPT +local: model_doc/pegasus + title: Pegasus +local: model_doc/pegasus_x + title: PEGASUS-X +local: model_doc/persimmon + title: Persimmon +local: model_doc/phi + title: Phi +local: model_doc/phobert + title: PhoBERT +local: model_doc/plbart + title: PLBart +local: model_doc/prophetnet + title: ProphetNet +local: model_doc/qdqbert + title: QDQBert +local: model_doc/qwen2 + title: Qwen2 +local: model_doc/rag + title: RAG +local: model_doc/realm + title: REALM +local: model_doc/reformer + title: Reformer +local: model_doc/rembert + title: RemBERT +local: model_doc/retribert + title: RetriBERT +local: model_doc/roberta + title: RoBERTa +local: model_doc/roberta-prelayernorm + title: RoBERTa-PreLayerNorm +local: model_doc/roc_bert + title: RoCBert +local: model_doc/roformer + title: RoFormer +local: model_doc/rwkv + title: RWKV +local: model_doc/splinter + title: Splinter +local: model_doc/squeezebert + title: SqueezeBERT +local: model_doc/stablelm + title: StableLm +local: model_doc/switch_transformers + title: SwitchTransformers +local: model_doc/t5 + title: T5 +local: model_doc/t5v1.1 + title: T5v1.1 +local: model_doc/tapex + title: TAPEX +local: model_doc/transfo-xl + title: Transformer XL +local: model_doc/ul2 + title: UL2 +local: model_doc/umt5 + title: UMT5 +local: model_doc/xmod + title: X-MOD +local: model_doc/xglm + title: XGLM +local: model_doc/xlm + title: XLM +local: model_doc/xlm-prophetnet + title: XLM-ProphetNet +local: model_doc/xlm-roberta + title: XLM-RoBERTa +local: model_doc/xlm-roberta-xl + title: XLM-RoBERTa-XL +local: model_doc/xlm-v + title: XLM-V +local: model_doc/xlnet + title: XLNet +local: model_doc/yoso + title: YOSO + title: Text models +isExpanded: false + sections: +local: model_doc/beit + title: BEiT +local: model_doc/bit + title: BiT +local: model_doc/conditional_detr + title: Conditional DETR +local: model_doc/convnext + title: ConvNeXT +local: model_doc/convnextv2 + title: ConvNeXTV2 +local: model_doc/cvt + title: CvT +local: model_doc/deformable_detr + title: Deformable DETR +local: model_doc/deit + title: DeiT +local: model_doc/depth_anything + title: Depth Anything +local: model_doc/deta + title: DETA +local: model_doc/detr + title: DETR +local: model_doc/dinat + title: DiNAT +local: model_doc/dinov2 + title: DINOV2 +local: model_doc/dit + title: DiT +local: model_doc/dpt + title: DPT +local: model_doc/efficientformer + title: EfficientFormer +local: model_doc/efficientnet + title: EfficientNet +local: model_doc/focalnet + title: FocalNet +local: model_doc/glpn + title: GLPN +local: model_doc/imagegpt + title: ImageGPT +local: model_doc/levit + title: LeViT +local: model_doc/mask2former + title: Mask2Former +local: model_doc/maskformer + title: MaskFormer +local: model_doc/mobilenet_v1 + title: MobileNetV1 +local: model_doc/mobilenet_v2 + title: MobileNetV2 +local: model_doc/mobilevit + title: MobileViT +local: model_doc/mobilevitv2 + title: MobileViTV2 +local: model_doc/nat + title: NAT +local: model_doc/poolformer + title: PoolFormer +local: model_doc/pvt + 
title: Pyramid Vision Transformer (PVT) +local: model_doc/regnet + title: RegNet +local: model_doc/resnet + title: ResNet +local: model_doc/segformer + title: SegFormer +local: model_doc/swiftformer + title: SwiftFormer +local: model_doc/swin + title: Swin Transformer +local: model_doc/swinv2 + title: Swin Transformer V2 +local: model_doc/swin2sr + title: Swin2SR +local: model_doc/table-transformer + title: Table Transformer +local: model_doc/upernet + title: UperNet +local: model_doc/van + title: VAN +local: model_doc/vit + title: Vision Transformer (ViT) +local: model_doc/vit_hybrid + title: ViT Hybrid +local: model_doc/vitdet + title: ViTDet +local: model_doc/vit_mae + title: ViTMAE +local: model_doc/vitmatte + title: ViTMatte +local: model_doc/vit_msn + title: ViTMSN +local: model_doc/yolos + title: YOLOS + title: Vision models +isExpanded: false + sections: +local: model_doc/audio-spectrogram-transformer + title: Audio Spectrogram Transformer +local: model_doc/bark + title: Bark +local: model_doc/clap + title: CLAP +local: model_doc/encodec + title: EnCodec +local: model_doc/hubert + title: Hubert +local: model_doc/mctct + title: MCTCT +local: model_doc/mms + title: MMS +local: model_doc/musicgen + title: MusicGen +local: model_doc/pop2piano + title: Pop2Piano +local: model_doc/seamless_m4t + title: Seamless-M4T +local: model_doc/seamless_m4t_v2 + title: SeamlessM4T-v2 +local: model_doc/sew + title: SEW +local: model_doc/sew-d + title: SEW-D +local: model_doc/speech_to_text + title: Speech2Text +local: model_doc/speech_to_text_2 + title: Speech2Text2 +local: model_doc/speecht5 + title: SpeechT5 +local: model_doc/unispeech + title: UniSpeech +local: model_doc/unispeech-sat + title: UniSpeech-SAT +local: model_doc/univnet + title: UnivNet +local: model_doc/vits + title: VITS +local: model_doc/wav2vec2 + title: Wav2Vec2 +local: model_doc/wav2vec2-bert + title: Wav2Vec2-BERT +local: model_doc/wav2vec2-conformer + title: Wav2Vec2-Conformer +local: model_doc/wav2vec2_phoneme + title: Wav2Vec2Phoneme +local: model_doc/wavlm + title: WavLM +local: model_doc/whisper + title: Whisper +local: model_doc/xls_r + title: XLS-R +local: model_doc/xlsr_wav2vec2 + title: XLSR-Wav2Vec2 + title: Audio models +isExpanded: false + sections: +local: model_doc/timesformer + title: TimeSformer +local: model_doc/videomae + title: VideoMAE +local: model_doc/vivit + title: ViViT + title: Video models +isExpanded: false + sections: +local: model_doc/align + title: ALIGN +local: model_doc/altclip + title: AltCLIP +local: model_doc/blip + title: BLIP +local: model_doc/blip-2 + title: BLIP-2 +local: model_doc/bridgetower + title: BridgeTower +local: model_doc/bros + title: BROS +local: model_doc/chinese_clip + title: Chinese-CLIP +local: model_doc/clip + title: CLIP +local: model_doc/clipseg + title: CLIPSeg +local: model_doc/clvp + title: CLVP +local: model_doc/data2vec + title: Data2Vec +local: model_doc/deplot + title: DePlot +local: model_doc/donut + title: Donut +local: model_doc/flava + title: FLAVA +local: model_doc/git + title: GIT +local: model_doc/groupvit + title: GroupViT +local: model_doc/idefics + title: IDEFICS +local: model_doc/instructblip + title: InstructBLIP +local: model_doc/kosmos-2 + title: KOSMOS-2 +local: model_doc/layoutlm + title: LayoutLM +local: model_doc/layoutlmv2 + title: LayoutLMV2 +local: model_doc/layoutlmv3 + title: LayoutLMV3 +local: model_doc/layoutxlm + title: LayoutXLM +local: model_doc/lilt + title: LiLT +local: model_doc/llava + title: Llava +local: model_doc/lxmert + title: 
LXMERT +local: model_doc/matcha + title: MatCha +local: model_doc/mgp-str + title: MGP-STR +local: model_doc/nougat + title: Nougat +local: model_doc/oneformer + title: OneFormer +local: model_doc/owlvit + title: OWL-ViT +local: model_doc/owlv2 + title: OWLv2 +local: model_doc/perceiver + title: Perceiver +local: model_doc/pix2struct + title: Pix2Struct +local: model_doc/sam + title: Segment Anything +local: model_doc/siglip + title: SigLIP +local: model_doc/speech-encoder-decoder + title: Speech Encoder Decoder Models +local: model_doc/tapas + title: TAPAS +local: model_doc/trocr + title: TrOCR +local: model_doc/tvlt + title: TVLT +local: model_doc/tvp + title: TVP +local: model_doc/vilt + title: ViLT +local: model_doc/vipllava + title: VipLlava +local: model_doc/vision-encoder-decoder + title: Vision Encoder Decoder Models +local: model_doc/vision-text-dual-encoder + title: Vision Text Dual Encoder +local: model_doc/visual_bert + title: VisualBERT +local: model_doc/xclip + title: X-CLIP + title: Multimodal models +isExpanded: false + sections: +local: model_doc/decision_transformer + title: Decision Transformer +local: model_doc/trajectory_transformer + title: Trajectory Transformer + title: Reinforcement learning models +isExpanded: false + sections: +local: model_doc/autoformer + title: Autoformer +local: model_doc/informer + title: Informer +local: model_doc/patchtsmixer + title: PatchTSMixer +local: model_doc/patchtst + title: PatchTST +local: model_doc/time_series_transformer + title: Time Series Transformer + title: Time series models +isExpanded: false + sections: +local: model_doc/graphormer + title: Graphormer + title: Graph models +title: Models + +sections: +local: internal/modeling_utils + title: Custom Layers and Utilities +local: internal/pipelines_utils + title: Utilities for pipelines +local: internal/tokenization_utils + title: Utilities for Tokenizers +local: internal/trainer_utils + title: Utilities for Trainer +local: internal/generation_utils + title: Utilities for Generation +local: internal/image_processing_utils + title: Utilities for Image Processors +local: internal/audio_utils + title: Utilities for Audio processing +local: internal/file_utils + title: General Utilities +local: internal/time_series_utils + title: Utilities for Time Series +title: Internal Helpers + title: API \ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_10.txt b/chunked/content_aware_chunking/_accelerate/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6efca4cf2a17c212af67ae129dde6a33a97a2e --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_10.txt @@ -0,0 +1,6 @@ +Wrap all the code responsible for training in a function, and pass it to [~accelerate.notebook_launcher]: + +from accelerate import notebook_launcher +notebook_launcher(training_function) + +For more information about 🤗 Accelerate and its rich features, refer to the documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_5.txt b/chunked/content_aware_chunking/_accelerate/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9fb5c5715fdbf9352d1712e1f9dbc813db7f00 --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_5.txt @@ -0,0 +1 @@ +You don't need to explicitly place your model on a device. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_6.txt b/chunked/content_aware_chunking/_accelerate/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..04fa7fa0141fd08fa1e9487f46d3a7ec743280dd --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_6.txt @@ -0,0 +1,5 @@ +from accelerate import Accelerator +accelerator = Accelerator() + +Prepare to accelerate +The next step is to pass all the relevant training objects to the [~accelerate.Accelerator.prepare] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_7.txt b/chunked/content_aware_chunking/_accelerate/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a432a98f0f266e9b0c7de7b1c763a7e2fcdfa54 --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_7.txt @@ -0,0 +1,21 @@ +This includes your training and evaluation DataLoaders, a model and an optimizer: + +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + train_dataloader, eval_dataloader, model, optimizer + ) + +Backward +The last addition is to replace the typical loss.backward() in your training loop with 🤗 Accelerate's [~accelerate.Accelerator.backward]method: + +for epoch in range(num_epochs): + for batch in train_dataloader: + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + +As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_8.txt b/chunked/content_aware_chunking/_accelerate/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2e803beb10ba64f6e7fb943a92c083a19ef5d45 --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_8.txt @@ -0,0 +1,41 @@ ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + +accelerator = Accelerator() + +model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +model.to(device) + +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + +train_dataloader, eval_dataloader, model, optimizer +) + +num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) +progress_bar = tqdm(range(num_training_steps)) +model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: + + outputs = model(**batch) + loss = outputs.loss + ++ accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + +Train +Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_accelerate/chunk_9.txt b/chunked/content_aware_chunking/_accelerate/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..024946adeb7d0d458c018e88a696f4815529bdf9 --- /dev/null +++ b/chunked/content_aware_chunking/_accelerate/chunk_9.txt @@ -0,0 +1,9 @@ +Train with a script +If you are running your training from a script, run the following command to create and save a configuration file: + +accelerate config +Then launch your training with: + +accelerate launch train.py +Train with a notebook +🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_110.txt b/chunked/content_aware_chunking/_add_new_model/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..b67a551d1138294b0d84e29cd6116c07b40e734b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_110.txt @@ -0,0 +1 @@ +Be able to locate the important components of the model: Where is the model's class? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_111.txt b/chunked/content_aware_chunking/_add_new_model/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a5d41b695bc239a77ddc52577f185c57276994b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_111.txt @@ -0,0 +1,2 @@ +Are there model sub-classes, + e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_112.txt b/chunked/content_aware_chunking/_add_new_model/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..c574882cd1aafd3fe5f421634a973c1e33a931aa --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_112.txt @@ -0,0 +1 @@ +EncoderModel, DecoderModel? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_113.txt b/chunked/content_aware_chunking/_add_new_model/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..5361d6f5ba114cfa9b948c88057611ae2df1ea2e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_113.txt @@ -0,0 +1 @@ +Where is the self-attention layer? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_114.txt b/chunked/content_aware_chunking/_add_new_model/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..871e84ed3d694c36ff31e0ffdcaedc9d5fa1f4a3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_114.txt @@ -0,0 +1,2 @@ +Are there multiple different attention layers, + e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_115.txt b/chunked/content_aware_chunking/_add_new_model/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..23423ead08535e4636ab6df855f2d8a8b70f928a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_115.txt @@ -0,0 +1 @@ +self-attention, cross-attention? 
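To answer these locating questions quickly in a PyTorch code base, it can help to instantiate the model and list its submodules. A minimal sketch, using a tiny randomly initialized 🤗 Transformers BERT purely as a stand-in (for a real port you would instantiate the original repository's own model object instead):

from transformers import BertConfig, BertModel

# Tiny random config so the model builds instantly; the exact sizes do not matter here.
config = BertConfig(hidden_size=64, num_hidden_layers=2, num_attention_heads=2, intermediate_size=128)
model = BertModel(config)

# Print every submodule whose name hints at attention; adjust the filter to the repository's naming.
for name, module in model.named_modules():
    if "attention" in name.lower():
        print(name, type(module).__name__)

The printed paths (e.g. encoder.layer.0.attention.self for the stand-in BERT) are exactly the kind of names you will later need when comparing intermediate outputs and mapping weights.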
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_116.txt b/chunked/content_aware_chunking/_add_new_model/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3af70baefaf8435618f32fa2dd2700deef23be1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_116.txt @@ -0,0 +1 @@ +How can you debug the model in the original environment of the repo? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_117.txt b/chunked/content_aware_chunking/_add_new_model/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..dce2f06a68f446ebb73f81167a62abe9254e6ed9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_117.txt @@ -0,0 +1,2 @@ +Do you have to add print statements, can you + work with an interactive debugger like ipdb, or should you use an efficient IDE to debug the model, like PyCharm? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_118.txt b/chunked/content_aware_chunking/_add_new_model/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f861d6cd41513ef0c9b4932d4d3ec160dfe6fd3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_118.txt @@ -0,0 +1,2 @@ +It is very important that before you start the porting process, you can efficiently debug code in the original +repository! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_119.txt b/chunked/content_aware_chunking/_add_new_model/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..734ae332fd7d470e2adcb464d1533b0b8ad31000 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_119.txt @@ -0,0 +1,2 @@ +Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or +even a pull request in the original repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_120.txt b/chunked/content_aware_chunking/_add_new_model/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1e30488085cdd27dfd535e6c9c664ed7e1a6667 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_120.txt @@ -0,0 +1,2 @@ +The maintainers of this repository are most likely very happy about +someone looking into their code! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_121.txt b/chunked/content_aware_chunking/_add_new_model/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..429efe5b2c1b3db70b7653b11cd0ff4784976429 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_121.txt @@ -0,0 +1,2 @@ +At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_122.txt b/chunked/content_aware_chunking/_add_new_model/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..99c2da77d1c89b207e4d852bc24d80ef4e7e7262 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_122.txt @@ -0,0 +1,2 @@ +We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to +dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. 
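A minimal sketch of keeping the whole debugging session on CPU, assuming the original checkpoint is a plain PyTorch state dict (the path and the OriginalBrandNewBert class below are placeholders for the original repository's own objects):

import torch

state_dict = torch.load("/path/to/checkpoint.bin", map_location="cpu")  # force all tensors onto CPU
model = OriginalBrandNewBert()   # hypothetical: however the original repository builds its model
model.load_state_dict(state_dict)
model.eval()                     # disable dropout so repeated forward passes stay comparable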
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_123.txt b/chunked/content_aware_chunking/_add_new_model/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f312ace4d2f11097b767c8c7f654a777ea8bb04 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_123.txt @@ -0,0 +1,3 @@ +Only +at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the +model also works as expected on GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_124.txt b/chunked/content_aware_chunking/_add_new_model/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..d45b521311c9dcc0f2d70d65a1e3f97d12efbfb0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_124.txt @@ -0,0 +1,4 @@ +In general, there are two possible debugging environments for running the original model + +Jupyter notebooks / google colab +Local python scripts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_125.txt b/chunked/content_aware_chunking/_add_new_model/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..40572f1aa281c8a53a11884dca2c4072da9858e5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_125.txt @@ -0,0 +1,2 @@ +Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split +logical components from one another and to have faster debugging cycles as intermediate results can be stored. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_126.txt b/chunked/content_aware_chunking/_add_new_model/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..6097aaae0bbeff07f2f4e2a495a463307d164456 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_126.txt @@ -0,0 +1,3 @@ +Also, +notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging +Face team for help. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_127.txt b/chunked/content_aware_chunking/_add_new_model/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac104048482dad2c5e637005468e73f4eada07c0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_127.txt @@ -0,0 +1 @@ +If you are familiar with Jupyter notebooks, we strongly recommend you work with them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_128.txt b/chunked/content_aware_chunking/_add_new_model/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa64202c1962fc817cfbd1693a7bccfcb78cd1c7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_128.txt @@ -0,0 +1,3 @@ +The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend +some time adjusting to the new programming environment and you might not be able to use your known debugging tools +anymore, like ipdb. 
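If you prefer local scripts precisely because of tools like ipdb, a minimal sketch of dropping into the debugger right before the interesting call (assumes ipdb is installed; Python's built-in breakpoint() works the same way):

import ipdb

def run_forward(model, input_ids):
    ipdb.set_trace()         # execution pauses here; step into model(input_ids) interactively
    return model(input_ids)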
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_129.txt b/chunked/content_aware_chunking/_add_new_model/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d679f31e06b41415a8fceab2338ac35d42b10a2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_129.txt @@ -0,0 +1,2 @@ +For each code-base, a good first step is always to load a small pretrained checkpoint and to be able to reproduce a +single forward pass using a dummy integer vector of input IDs as an input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_130.txt b/chunked/content_aware_chunking/_add_new_model/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b6315e657aa0e7d174658becba1485cd33f45c9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_130.txt @@ -0,0 +1,14 @@ +Such a script could look like this (in +pseudocode): +python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +Next, regarding the debugging strategy, there are generally a few from which to choose from: + +Decompose the original model into many small testable components and run a forward pass on each of those for + verification +Decompose the original model only into the original tokenizer and the original model, run a forward pass on + those, and use intermediate print statements or breakpoints for verification + +Again, it is up to you which strategy to choose. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_131.txt b/chunked/content_aware_chunking/_add_new_model/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..9172337311e520da1ff041b14d4ab3d86e1f0dd4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_131.txt @@ -0,0 +1,2 @@ +Often, one or the other is advantageous depending on the original code +base. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_132.txt b/chunked/content_aware_chunking/_add_new_model/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdb31a8510c3887a258dbb2bed0cb83e92dd2cc2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_132.txt @@ -0,0 +1 @@ +If the original code-base allows you to decompose the model into smaller sub-components, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_133.txt b/chunked/content_aware_chunking/_add_new_model/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..a00cb667ca1b87197a6ae6b37643f495838c63b1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_133.txt @@ -0,0 +1,2 @@ +if the original +code-base can easily be run in eager mode, it is usually worth the effort to do so. 
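What such a decomposition can look like in practice, sketched under the assumption that the original model exposes its pieces as ordinary attributes (all attribute names below are hypothetical):

import torch

input_ids = torch.tensor([[0, 4, 5, 2, 3, 7, 9]])

# Run each sub-component on its own and keep the intermediate results,
# so they can later be compared against the 🤗 Transformers port component by component.
embedding_output = original_model.embeddings(input_ids)                   # hypothetical attribute
first_layer_output = original_model.encoder.layers[0](embedding_output)   # hypothetical attribute
torch.save({"embeddings": embedding_output, "layer_0": first_layer_output}, "reference_activations.pt")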
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_134.txt b/chunked/content_aware_chunking/_add_new_model/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..31972f1c34b2b7bf817538fc7e955399b7014d6b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_134.txt @@ -0,0 +1,15 @@ +There are some important advantages +to taking the more difficult road in the beginning: + +at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically + for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead + of relying on visual comparison via print statements +it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting + individual components and thus structure your work better +separating the model into logical meaningful components will help you to get a better overview of the model's design + and thus to better understand the model +at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue + changing your code + +Lysandre's integration checks for ELECTRA +gives a nice example of how this can be done. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_135.txt b/chunked/content_aware_chunking/_add_new_model/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..76a97ad26251c8c48471920e80e88db24dfc8093 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_135.txt @@ -0,0 +1,2 @@ +However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, +it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_136.txt b/chunked/content_aware_chunking/_add_new_model/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30cc7dba977d15c47c1bee69d80b9f9cae55507 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_136.txt @@ -0,0 +1,3 @@ +A good +example is T5's MeshTensorFlow library which is +very complex and does not offer a simple way to decompose the model into its sub-components. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_137.txt b/chunked/content_aware_chunking/_add_new_model/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..51cfc1aff38b3b626990b30b2618bb185b0ab8f2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_137.txt @@ -0,0 +1,2 @@ +For such libraries, one +often relies on verifying print statements. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_138.txt b/chunked/content_aware_chunking/_add_new_model/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..1929816d1481648cc90db99a9967a8e0bcee1598 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_138.txt @@ -0,0 +1,2 @@ +No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the +starting layers first and the ending layers last. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_139.txt b/chunked/content_aware_chunking/_add_new_model/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..3546ecda887f8a4254002ab5159ccef1b05fdd0c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_139.txt @@ -0,0 +1,11 @@ +It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following +layers in the following order: + +Retrieve the input IDs passed to the model +Retrieve the word embeddings +Retrieve the input of the first Transformer layer +Retrieve the output of the first Transformer layer +Retrieve the output of the following n - 1 Transformer layers +Retrieve the output of the whole BrandNewBert Model + +Input IDs should thereby consists of an array of integers, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_140.txt b/chunked/content_aware_chunking/_add_new_model/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..80ba474f256798040afeb1e209c43f1c90633c75 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_140.txt @@ -0,0 +1,12 @@ +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: +[[ + [-0.1465, -0.6501, 0.1993, , 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, , -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, , -0.3662, 0.6091, 0.7648], + , + [-0.5613, -0.6332, 0.4324, , -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, , -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, , -0.3339, 0.6533, 0.8694]]], +We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original +model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_141.txt b/chunked/content_aware_chunking/_add_new_model/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..d05ae114a447fa17704ae40457562eb232be4ae6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_141.txt @@ -0,0 +1,2 @@ +Since it is normal that the exact same model written in different libraries can give a slightly different output +depending on the library framework, we accept an error tolerance of 1e-3 (0.001). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_142.txt b/chunked/content_aware_chunking/_add_new_model/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..bec06283f2c0efefc44ea5a744bc9b1ffc72ed9d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_142.txt @@ -0,0 +1,2 @@ +It is not enough if the model gives +nearly the same output, they have to be almost identical. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_143.txt b/chunked/content_aware_chunking/_add_new_model/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdf8676769ff9fde61618dba9f0e72531667e4d6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_143.txt @@ -0,0 +1,4 @@ +Therefore, you will certainly compare the intermediate +outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of +brand_new_bert in which case an efficient debugging environment of the original repository is absolutely +important. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_144.txt b/chunked/content_aware_chunking/_add_new_model/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f000a98e77aa73e96a25a57cdfb044fe98a7ce7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_144.txt @@ -0,0 +1 @@ +Here is some advice to make your debugging environment as efficient as possible. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_145.txt b/chunked/content_aware_chunking/_add_new_model/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..de3d0cb221a7d1a8f93b80b256c6a64d5f590c5c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_145.txt @@ -0,0 +1 @@ +Find the best way of debugging intermediate results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_146.txt b/chunked/content_aware_chunking/_add_new_model/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a258eac99324b97465d3168b194b2be47cec15f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_146.txt @@ -0,0 +1 @@ +Is the original repository written in PyTorch? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_147.txt b/chunked/content_aware_chunking/_add_new_model/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8eb04ea464b73d24f8fedc956972a09132e2205 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_147.txt @@ -0,0 +1,3 @@ +Then you should + probably take the time to write a longer script that decomposes the original model into smaller sub-components to + retrieve intermediate values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_148.txt b/chunked/content_aware_chunking/_add_new_model/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4b6579a6c102d69b4db6ca033403de022c2ab1c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_148.txt @@ -0,0 +1 @@ +Is the original repository written in Tensorflow 1? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_149.txt b/chunked/content_aware_chunking/_add_new_model/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..866020c602b07a6d1d8c0b9fb7d1c193b68dc1fa --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_149.txt @@ -0,0 +1,3 @@ +Then you might have to rely on + TensorFlow print operations like tf.print to output + intermediate values. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_150.txt b/chunked/content_aware_chunking/_add_new_model/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7dfb5e9aa4e9f1b527de361b72dc5effd797add --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_150.txt @@ -0,0 +1 @@ +Is the original repository written in Jax? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_151.txt b/chunked/content_aware_chunking/_add_new_model/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d3da63dbaee1a198a3d03163be3485936815582 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_151.txt @@ -0,0 +1,2 @@ +Then make sure that the model is not jitted when + running the forward pass, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_152.txt b/chunked/content_aware_chunking/_add_new_model/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..231ad7fbf98c8740bbbe6defa4c1b72133d64f6a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_152.txt @@ -0,0 +1 @@ +check-out this link. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_153.txt b/chunked/content_aware_chunking/_add_new_model/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..87a716efa90892fe05edce5066c9fb9ad005ea6d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_153.txt @@ -0,0 +1 @@ +Use the smallest pretrained checkpoint you can find. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_154.txt b/chunked/content_aware_chunking/_add_new_model/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a996b7ab55e3aae67bd7d43913a6c90a59d7ca9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_154.txt @@ -0,0 +1,2 @@ +The smaller the checkpoint, the faster your debug cycle + becomes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_155.txt b/chunked/content_aware_chunking/_add_new_model/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..9985f22e81ab9abd67ef64d6176fc5e8765b3024 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_155.txt @@ -0,0 +1 @@ +It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_156.txt b/chunked/content_aware_chunking/_add_new_model/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..b897e691ec07ef4806f676317103785b7e82d427 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_156.txt @@ -0,0 +1,4 @@ +In case only very large checkpoints are available, it might make more sense to create a dummy model in the new + environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version + of your model +Make sure you are using the easiest way of calling a forward pass in the original repository. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_157.txt b/chunked/content_aware_chunking/_add_new_model/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f56f5c037c741a2de8d9d127ccaa91a76eb5e43 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_157.txt @@ -0,0 +1,2 @@ +Ideally, you want to + find the function in the original repository that only calls a single forward pass, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_158.txt b/chunked/content_aware_chunking/_add_new_model/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..45d7ca6bf9bc53d7f938075671afe90cc671965a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_158.txt @@ -0,0 +1,2 @@ +that is often called + predict, evaluate, forward or __call__. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_159.txt b/chunked/content_aware_chunking/_add_new_model/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..844706970d8c2eb5e30bc43f371426c41e8a51ee --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_159.txt @@ -0,0 +1,2 @@ +You don't want to debug a function that calls forward + multiple times, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_160.txt b/chunked/content_aware_chunking/_add_new_model/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..9695868891cb91604f99bdbf0a40b49537b2b38d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_160.txt @@ -0,0 +1 @@ +to generate text, like autoregressive_sample, generate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_161.txt b/chunked/content_aware_chunking/_add_new_model/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b9c0426f33bde83b28835e64351678df9468ef --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_161.txt @@ -0,0 +1 @@ +Try to separate the tokenization from the model's forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_162.txt b/chunked/content_aware_chunking/_add_new_model/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6a6e4bf29cc79f604a8cd180600c8d581690968 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_162.txt @@ -0,0 +1,3 @@ +If the original repository shows examples where + you have to input a string, then try to find out where in the forward call the string input is changed to input ids + and start from this point. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_163.txt b/chunked/content_aware_chunking/_add_new_model/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..f98896b9b90f59f86d90e55ecd19aa6ec1a80084 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_163.txt @@ -0,0 +1,2 @@ +This might mean that you have to possibly write a small script yourself or change the + original code so that you can directly input the ids instead of an input string. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_164.txt b/chunked/content_aware_chunking/_add_new_model/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c807e04a9b251b1a977266680f0b86d774644c8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_164.txt @@ -0,0 +1,2 @@ +Make sure that the model in your debugging setup is not in training mode, which often causes the model to yield + random outputs due to multiple dropout layers in the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_165.txt b/chunked/content_aware_chunking/_add_new_model/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7c114205d6faa0b16e7419b81f7666540d59bde --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_165.txt @@ -0,0 +1,2 @@ +Make sure that the forward pass in your debugging + environment is deterministic so that the dropout layers are not used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_166.txt b/chunked/content_aware_chunking/_add_new_model/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..f50f1b2435214f6faf8192ee9267551124b732cb --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_166.txt @@ -0,0 +1,2 @@ +Or use transformers.utils.set_seed + if the old and new implementations are in the same framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_167.txt b/chunked/content_aware_chunking/_add_new_model/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..de14cb3db1f36162db6d8d42ba09f80eb8df5c5d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_167.txt @@ -0,0 +1 @@ +The following section gives you more specific details/tips on how you can do this for brand_new_bert. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_168.txt b/chunked/content_aware_chunking/_add_new_model/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c54c94f375df096d1fd6cd50a3abd1216690f24 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_168.txt @@ -0,0 +1 @@ +5.-14. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_169.txt b/chunked/content_aware_chunking/_add_new_model/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..10b4306027fb57a9e03ab1897e71c50523ced723 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_169.txt @@ -0,0 +1,2 @@ +Port BrandNewBert to 🤗 Transformers +Next, you can finally start adding new code to 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_170.txt b/chunked/content_aware_chunking/_add_new_model/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fc465134e51f37734cb22745a2fd55395df50e8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_170.txt @@ -0,0 +1,5 @@ +Go into the clone of your 🤗 Transformers' fork: + +cd transformers +In the special case that you are adding a model whose architecture exactly matches the model architecture of an +existing model you only have to add a conversion script as described in this section. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_171.txt b/chunked/content_aware_chunking/_add_new_model/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..f74f296901ac384ac76d321723e05281cd4d17b6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_171.txt @@ -0,0 +1 @@ +In this case, you can just re-use the whole model architecture of the already existing model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_172.txt b/chunked/content_aware_chunking/_add_new_model/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..4837ec3aba8dc67c40da7bfdc2c51a18f8081cf4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_172.txt @@ -0,0 +1 @@ +Otherwise, let's start generating a new model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_173.txt b/chunked/content_aware_chunking/_add_new_model/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5e2b852e764ff3aef280032556dbe281d0ca8c8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_173.txt @@ -0,0 +1,6 @@ +You have two choices here: + +transformers-cli add-new-model-like to add a new model like an existing one +transformers-cli add-new-model to add a new model from our template (will look like BERT or Bart depending on the type of model you select) + +In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_174.txt b/chunked/content_aware_chunking/_add_new_model/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..04a77d68ff56dfd0d5791e603385d60439c725f5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_174.txt @@ -0,0 +1 @@ +The second command requires you to install cookiecutter; you can find more information on it here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_175.txt b/chunked/content_aware_chunking/_add_new_model/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..a146db85e7038d96bd2510988a3b1df99a46625d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_175.txt @@ -0,0 +1,3 @@ +Open a Pull Request on the main huggingface/transformers repo +Before starting to adapt the automatically generated code, now is the time to open a "Work in progress (WIP)" pull +request, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_176.txt b/chunked/content_aware_chunking/_add_new_model/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..b372d2cb23773351495a28c6dc73c1be13c73b91 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_176.txt @@ -0,0 +1,2 @@ +"[WIP] Add brand_new_bert", in 🤗 Transformers so that you and the Hugging Face team can work +side-by-side on integrating the model into 🤗 Transformers.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_177.txt b/chunked/content_aware_chunking/_add_new_model/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..02257e24b80559b17fc34a889ae2aa091fbbef6b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_177.txt @@ -0,0 +1,9 @@ +You should do the following: + +Create a branch with a descriptive name from your main branch + +git checkout -b add_brand_new_bert + +Commit the automatically generated code: + +git add . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_178.txt b/chunked/content_aware_chunking/_add_new_model/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..21b36c117ce33eea391c5042faf260c1456f3b85 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_178.txt @@ -0,0 +1,12 @@ +git commit + +Fetch and rebase to current main + +git fetch upstream +git rebase upstream/main + +Push the changes to your account using: + +git push -u origin a-descriptive-name-for-my-changes + +Once you are satisfied, go to the webpage of your fork on GitHub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_179.txt b/chunked/content_aware_chunking/_add_new_model/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75556052af8e2c70205a6902f1b28b7aa5eb5f5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_179.txt @@ -0,0 +1 @@ +Click on "Pull request". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_180.txt b/chunked/content_aware_chunking/_add_new_model/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0b1892d3343828da4f6090d9709dfe62779eb1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_180.txt @@ -0,0 +1,3 @@ +Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_181.txt b/chunked/content_aware_chunking/_add_new_model/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..db3be93eee9eb5990852f4089e0fd533d85a1ef4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_181.txt @@ -0,0 +1 @@ +Change the PR into a draft by clicking on "Convert to draft" on the right of the GitHub pull request web page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_182.txt b/chunked/content_aware_chunking/_add_new_model/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..9abfb8a4904a437caf00b15fea1dd5663562bd54 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_182.txt @@ -0,0 +1,2 @@ +In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so +that it shows in the pull request.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_183.txt b/chunked/content_aware_chunking/_add_new_model/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f27fc9e4e09680bd654a9f5ca4cbd625596f7bc --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_183.txt @@ -0,0 +1,7 @@ +Additionally, you should make sure to update your work with the current main from +time to time by doing: + +git fetch upstream +git merge upstream/main +In general, all questions you might have regarding the model or your implementation should be asked in your PR and +discussed/solved in the PR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_184.txt b/chunked/content_aware_chunking/_add_new_model/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..44eeceea4beec90f956f0fd3641d81160c1776ba --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_184.txt @@ -0,0 +1,2 @@ +This way, the Hugging Face team will always be notified when you are committing new code or +if you have a question. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_185.txt b/chunked/content_aware_chunking/_add_new_model/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..950fa71e9b699d0c474738d2dc7e7f84749dcedd --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_185.txt @@ -0,0 +1,2 @@ +It is often very helpful to point the Hugging Face team to your added code so that the Hugging +Face team can efficiently understand your problem or question. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_186.txt b/chunked/content_aware_chunking/_add_new_model/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..8664d336dfccbda80775842d4b5fc0df4eab71d8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_186.txt @@ -0,0 +1,2 @@ +To do so, you can go to the "Files changed" tab where you see all of your changes, go to a line regarding which you +want to ask a question, and click on the "+" symbol to add a comment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_187.txt b/chunked/content_aware_chunking/_add_new_model/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e37150749b1c31009d9d2c8348aede9584aa77 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_187.txt @@ -0,0 +1,2 @@ +Whenever a question or problem has been solved, +you can click on the "Resolve" button of the created comment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_188.txt b/chunked/content_aware_chunking/_add_new_model/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..281d6eade351291891424f2daf51e99f0bf92194 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_188.txt @@ -0,0 +1 @@ +In the same way, the Hugging Face team will open comments when reviewing your code.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_189.txt b/chunked/content_aware_chunking/_add_new_model/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..b130efda0cc87afa64f8a4f50daf230eaeb6819e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_189.txt @@ -0,0 +1,2 @@ +We recommend asking most questions +on GitHub on your PR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_190.txt b/chunked/content_aware_chunking/_add_new_model/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11eef8e9bcf08caa9c5dc9374dd39d8e7b0ed6f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_190.txt @@ -0,0 +1,2 @@ +For some very general questions that are not very useful for the public, feel free to ping the +Hugging Face team by Slack or email. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_191.txt b/chunked/content_aware_chunking/_add_new_model/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_191.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_192.txt b/chunked/content_aware_chunking/_add_new_model/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7bb0e35d357ff7428f648ed76c7f9fabafac981 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_192.txt @@ -0,0 +1,2 @@ +Adapt the generated models code for brand_new_bert +At first, we will focus only on the model itself and not care about the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_193.txt b/chunked/content_aware_chunking/_add_new_model/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..728e564f4b17598ff3116f059ddb81c8dcedd4c4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_193.txt @@ -0,0 +1,3 @@ +All the relevant code should be +found in the generated files src/transformers/models/brand_new_bert/modeling_brand_new_bert.py and +src/transformers/models/brand_new_bert/configuration_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_194.txt b/chunked/content_aware_chunking/_add_new_model/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..30fbb01f0b4cbad29eb2f4bfd1bbf0514eb1ac15 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_194.txt @@ -0,0 +1 @@ +Now you can finally start coding :). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_195.txt b/chunked/content_aware_chunking/_add_new_model/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..4667b224597124cc51b5d50477bcc0ee04100f22 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_195.txt @@ -0,0 +1,3 @@ +The generated code in +src/transformers/models/brand_new_bert/modeling_brand_new_bert.py will either have the same architecture as BERT if +it's an encoder-only model or BART if it's an encoder-decoder model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_196.txt b/chunked/content_aware_chunking/_add_new_model/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2d7256d924a69b860ca7c5dc780624b85f9d4d3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_196.txt @@ -0,0 +1,3 @@ +At this point, you should remind yourself what +you've learned in the beginning about the theoretical aspects of the model: How is the model different from BERT or +BART?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_197.txt b/chunked/content_aware_chunking/_add_new_model/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..0778f79dad9d94f3d3e466109f6d8a5825427144 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_197.txt @@ -0,0 +1,3 @@ +Implement those changes which often means changing the self-attention layer, the order of the normalization +layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to +get a better feeling of how your model should be implemented. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_198.txt b/chunked/content_aware_chunking/_add_new_model/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..617dc6ef7b385b3d644d80c28af289cb31218bf6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_198.txt @@ -0,0 +1 @@ +Note that at this point, you don't have to be very sure that your code is fully correct or clean. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_199.txt b/chunked/content_aware_chunking/_add_new_model/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cf72fd0f3604e71f8f851b9cefa4797688f86e1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_199.txt @@ -0,0 +1,4 @@ +Rather, it is +advised to add a first unclean, copy-pasted version of the original code to +src/transformers/models/brand_new_bert/modeling_brand_new_bert.py until you feel like all the necessary code is +added. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_200.txt b/chunked/content_aware_chunking/_add_new_model/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b308849d03dd1265283af2b259ec9f78bef9d73 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_200.txt @@ -0,0 +1,2 @@ +From our experience, it is much more efficient to quickly add a first version of the required code and +improve/correct the code iteratively with the conversion script as described in the next section. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_201.txt b/chunked/content_aware_chunking/_add_new_model/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9a28f708d30cd6d3eefb688b38c21a1207018a1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_201.txt @@ -0,0 +1,2 @@ +The only thing that +has to work at this point is that you can instantiate the 🤗 Transformers implementation of brand_new_bert, i.e. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_202.txt b/chunked/content_aware_chunking/_add_new_model/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..2da1b1f213b3326df566fc4b1e6e7f775c1e6d53 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_202.txt @@ -0,0 +1,8 @@ +the +following command should work: +python +from transformers import BrandNewBertModel, BrandNewBertConfig +model = BrandNewBertModel(BrandNewBertConfig()) + +The above command will create a model according to the default parameters as defined in BrandNewBertConfig() with +random weights, thus making sure that the init() methods of all components work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_203.txt b/chunked/content_aware_chunking/_add_new_model/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..68a96ce1ece4eec915575bcda96321a7e4283f80 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_203.txt @@ -0,0 +1,2 @@ +Note that all random initialization should happen in the _init_weights method of your BrandNewBertPreTrainedModel +class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_204.txt b/chunked/content_aware_chunking/_add_new_model/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70cb3780ea5cd1dd32e6382d01eac3982aeccb5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_204.txt @@ -0,0 +1 @@ +It should initialize all leaf modules depending on the variables of the config. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_205.txt b/chunked/content_aware_chunking/_add_new_model/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..105386cfe99da3b416269e4c41c34e53d1d45d70 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_205.txt @@ -0,0 +1,17 @@ +Here is an example with the +BERT _init_weights method: +py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) +You can have some more custom schemes if you need a special initialization for some modules. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_206.txt b/chunked/content_aware_chunking/_add_new_model/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7dc83ef1106badd6452f0ea0eea91b8e1b4d05d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_206.txt @@ -0,0 +1,3 @@ +For instance, in +Wav2Vec2ForPreTraining, the last two linear layers need to have the initialization of the regular PyTorch nn.Linear +but all the other ones should use an initialization as above.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_207.txt b/chunked/content_aware_chunking/_add_new_model/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..90d95220ca5265b1cdab721d81f83952a39fcc72 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_207.txt @@ -0,0 +1,14 @@ +This is coded like this: +py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() +The _is_hf_initialized flag is internally used to make sure we only initialize a submodule once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_208.txt b/chunked/content_aware_chunking/_add_new_model/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb53869a85b6488d6874976a894da456ee646d74 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_208.txt @@ -0,0 +1,3 @@ +By setting it to +True for module.project_q and module.project_hid, we make sure the custom initialization we did is not overridden later on, +the _init_weights function won't be applied to them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_209.txt b/chunked/content_aware_chunking/_add_new_model/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..66aa82048927681af984d35e45d694eaf46a34f9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_209.txt @@ -0,0 +1 @@ +6. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_210.txt b/chunked/content_aware_chunking/_add_new_model/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..66184a1f2b149a440faedfacc9ab4e68ba46b1c8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_210.txt @@ -0,0 +1,4 @@ +Write a conversion script +Next, you should write a conversion script that lets you convert the checkpoint you used to debug brand_new_bert in +the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of +brand_new_bert. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_211.txt b/chunked/content_aware_chunking/_add_new_model/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9c6a46c536f913af70da90c6f274d1b0ffadeb3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_211.txt @@ -0,0 +1,3 @@ +It is not advised to write the conversion script from scratch, but rather to look through already +existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in +the same framework as brand_new_bert. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_212.txt b/chunked/content_aware_chunking/_add_new_model/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..9472cd92950ede7f9cdb09f03f61c4481a5c3224 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_212.txt @@ -0,0 +1,2 @@ +Usually, it is enough to copy an already existing conversion script and +slightly adapt it for your use case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_213.txt b/chunked/content_aware_chunking/_add_new_model/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcff5a6e408ccbf64cba9a98e910107c483948d6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_213.txt @@ -0,0 +1,2 @@ +Don't hesitate to ask the Hugging Face team to point you to a similar already +existing conversion script for your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_214.txt b/chunked/content_aware_chunking/_add_new_model/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3f00cb2fa43191427b0c935c9c99b85e26d5289 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_214.txt @@ -0,0 +1,4 @@ +If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script here +If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script here + +In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_215.txt b/chunked/content_aware_chunking/_add_new_model/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6e680681af74cecac09590c9a37a88d4191ce4b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_215.txt @@ -0,0 +1,2 @@ +In PyTorch, the +name of a layer is defined by the name of the class attribute you give the layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_216.txt b/chunked/content_aware_chunking/_add_new_model/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..32f30eba53fad9c9f72e307f6c218f65f1778457 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_216.txt @@ -0,0 +1,13 @@ +Let's define a dummy model in +PyTorch, called SimpleModel as follows: +python +from torch import nn +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) + +Now we can create an instance of this model definition which will fill all weights: dense, intermediate, +layer_norm with random weights.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_217.txt b/chunked/content_aware_chunking/_add_new_model/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..00e8e5ad7d2169d975fc0b7edcb6fb1847827101 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_217.txt @@ -0,0 +1,12 @@ +We can print the model to see its architecture +thon +model = SimpleModel() +print(model) + +This will print out the following: +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +We can see that the layer names are defined by the name of the class attribute in PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_218.txt b/chunked/content_aware_chunking/_add_new_model/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..94071fef4985299b4647dbf0859ed3a3d638d398 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_218.txt @@ -0,0 +1,25 @@ +You can print out the weight +values of a specific layer: +python +print(model.dense.weight.data) +to see that the weights were randomly initialized +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_219.txt b/chunked/content_aware_chunking/_add_new_model/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc28efe5bffc01cd87b14b1b1c6e1dfd6574a981 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_219.txt @@ -0,0 +1,2 @@ +In the conversion script, you should fill those randomly initialized weights with the exact weights of the +corresponding layer in the checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_220.txt b/chunked/content_aware_chunking/_add_new_model/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_220.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_221.txt b/chunked/content_aware_chunking/_add_new_model/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..07093513e6cd51b9ccc61554ba263ca33797e8dc --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_221.txt @@ -0,0 +1,2 @@ +thon +retrieve matching layer weights, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_222.txt b/chunked/content_aware_chunking/_add_new_model/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..63211ccfda5787aed13bfdaf7c96807fc943d8a1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_222.txt @@ -0,0 +1,9 @@ +by +recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer +model_pointer = getattr(model, "dense") +model_pointer.weight.data = torch.from_numpy(pretrained_weight) + +While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding +pretrained checkpoint weight exactly match in both shape and name. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_223.txt b/chunked/content_aware_chunking/_add_new_model/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89542736b3ecc485ddee8779e04dbdec5f9545c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_223.txt @@ -0,0 +1,2 @@ +To do so, it is necessary to add assert +statements for the shape and print out the names of the checkpoint weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_224.txt b/chunked/content_aware_chunking/_add_new_model/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_224.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_225.txt b/chunked/content_aware_chunking/_add_new_model/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..55c0df20150fc04c24743c36de8eac76cc884e32 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_225.txt @@ -0,0 +1,6 @@ +you should add statements like: +python +assert ( + model_pointer.weight.shape == pretrained_weight.shape +), f"Pointer shape of random weight {model_pointer.weight.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" +Besides, you should also print out the names of both weights to make sure they match, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_226.txt b/chunked/content_aware_chunking/_add_new_model/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0525810cff9a46064db54533ffffb344234c595 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_226.txt @@ -0,0 +1,4 @@ +python +logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") +If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly +initialized layer of the 🤗 Transformers implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_227.txt b/chunked/content_aware_chunking/_add_new_model/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..239fd53c4195dc7925e785224a054f5e0927abcf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_227.txt @@ -0,0 +1,2 @@ +An incorrect shape is most likely due to an incorrect setting of the config parameters in BrandNewBertConfig() that +do not exactly match those that were used for the checkpoint you want to convert.
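To tie the shape assertion and the name logging above together, the weight-loading part of a conversion script might look roughly like the sketch below. The helper name load_weights and the variables original_state_dict and name_mapping are illustrative assumptions, not names used by any actual 🤗 Transformers conversion script:

```python
import torch

def load_weights(hf_model, original_state_dict, name_mapping):
    # `original_state_dict` holds the checkpoint tensors from the original repository,
    # `name_mapping` maps original parameter names to 🤗 Transformers parameter names.
    hf_state_dict = hf_model.state_dict()
    for original_name, hf_name in name_mapping.items():
        pretrained_weight = original_state_dict[original_name]
        if not torch.is_tensor(pretrained_weight):  # e.g. a NumPy array when porting from TensorFlow
            pretrained_weight = torch.from_numpy(pretrained_weight)
        # Verify shape and name before assigning, as described above.
        assert hf_state_dict[hf_name].shape == pretrained_weight.shape, (
            f"Shape mismatch for {hf_name}: {hf_state_dict[hf_name].shape} vs {pretrained_weight.shape}"
        )
        print(f"Initialize PyTorch weight {hf_name} from {original_name}")
        hf_state_dict[hf_name] = pretrained_weight
    # Print checkpoint weights that were never used so nothing is silently dropped.
    print("Unused checkpoint weights:", sorted(set(original_state_dict) - set(name_mapping)))
    hf_model.load_state_dict(hf_state_dict)
    return hf_model
```

Keeping the name mapping explicit in one place also makes it easy to spot weights that still need a transpose or a renamed prefix.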
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_228.txt b/chunked/content_aware_chunking/_add_new_model/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..a25fa1acd6ff3dfb795bde42d25a7e0dcf8fbe38 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_228.txt @@ -0,0 +1,2 @@ +However, it could also be that +PyTorch's implementation of a layer requires the weight to be transposed beforehand. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_229.txt b/chunked/content_aware_chunking/_add_new_model/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b923fa7e444c57a02ad5aba2ff3746b4250e98d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_229.txt @@ -0,0 +1,2 @@ +Finally, you should also check that all required weights are initialized and print out all checkpoint weights that +were not used for initialization to make sure the model is correctly converted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_230.txt b/chunked/content_aware_chunking/_add_new_model/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..07934c396742e731ba5ce4f0e4b96998da8ea414 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_230.txt @@ -0,0 +1,2 @@ +It is completely normal, that the +conversion trials fail with either a wrong shape statement or a wrong name assignment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_231.txt b/chunked/content_aware_chunking/_add_new_model/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..08dc7f4d3c8aa0e47854abddf3970110616a9edf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_231.txt @@ -0,0 +1,4 @@ +This is most likely because either +you used incorrect parameters in BrandNewBertConfig(), have a wrong architecture in the 🤗 Transformers +implementation, you have a bug in the init() functions of one of the components of the 🤗 Transformers +implementation or you need to transpose one of the checkpoint weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_232.txt b/chunked/content_aware_chunking/_add_new_model/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..b01c473ec4cb774538a3b9567d4affa20d7427af --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_232.txt @@ -0,0 +1,2 @@ +This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the +Transformers model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_233.txt b/chunked/content_aware_chunking/_add_new_model/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c5084595221d56a964d52bee3671e5d78692ad1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_233.txt @@ -0,0 +1,6 @@ +Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save +the model under a folder of your choice /path/to/converted/checkpoint/folder that should then contain both a +pytorch_model.bin file and a config.json file: +python +model.save_pretrained("/path/to/converted/checkpoint/folder") +7. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_234.txt b/chunked/content_aware_chunking/_add_new_model/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa3f326725f776a9312de4d3bb89dea88fb53a3d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_234.txt @@ -0,0 +1,3 @@ +Implement the forward pass +Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make +sure that the forward pass is correctly implemented. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_235.txt b/chunked/content_aware_chunking/_add_new_model/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..30e23c47db71ffa3cbe1f699221cc3771c6bd86a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_235.txt @@ -0,0 +1,2 @@ +In Get familiar with the original repository, you have already created a script that runs a forward +pass of the model using the original repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_236.txt b/chunked/content_aware_chunking/_add_new_model/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddcfd76da9ae3b63cc3e2363cee441daf70bdec8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_236.txt @@ -0,0 +1,2 @@ +Now you should write an analogous script using the 🤗 Transformers +implementation instead of the original one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_237.txt b/chunked/content_aware_chunking/_add_new_model/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..626918bd12377040030d8f280f8a63d567afdf64 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_237.txt @@ -0,0 +1,7 @@ +It should look as follows: +python +model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +output = model(input_ids).last_hidden_states +It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact +same output the very first time or that the forward pass throws an error. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_238.txt b/chunked/content_aware_chunking/_add_new_model/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a5b19e4446f03d78d1d24f6c0a8d7ce0cbabef6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_238.txt @@ -0,0 +1 @@ +Don't be disappointed - it's expected! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_239.txt b/chunked/content_aware_chunking/_add_new_model/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..abf8cf50903c53c5677e859943d5aaef63a24356 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_239.txt @@ -0,0 +1,2 @@ +First, +you should make sure that the forward pass doesn't throw any errors. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_240.txt b/chunked/content_aware_chunking/_add_new_model/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fa295c30bb417769419298cb35f47ddc0cdca2a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_240.txt @@ -0,0 +1,2 @@ +It often happens that the wrong dimensions are +used leading to a Dimensionality mismatch error or that the wrong data type object is used, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_241.txt b/chunked/content_aware_chunking/_add_new_model/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..909a3ad17a6696f800f788d1f4bf60790d864c8d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_241.txt @@ -0,0 +1,2 @@ +torch.long +instead of torch.float32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_242.txt b/chunked/content_aware_chunking/_add_new_model/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca61e0a7c738258c9a08c63dfdc41cd6032974c6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_242.txt @@ -0,0 +1,2 @@ +Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve +certain errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_243.txt b/chunked/content_aware_chunking/_add_new_model/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..01faca3dd0f8ae00d92dab7c222332d85a54cea4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_243.txt @@ -0,0 +1,2 @@ +The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are +equivalent to a precision of 1e-3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_244.txt b/chunked/content_aware_chunking/_add_new_model/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6effcba5353b6ec2d32d9fbf93cf2347c7ee348 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_244.txt @@ -0,0 +1 @@ +First, you should ensure that the output shapes are identical, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_245.txt b/chunked/content_aware_chunking/_add_new_model/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aa05911a4fd18adf174881ea09eb414bbb47db1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_245.txt @@ -0,0 +1,2 @@ +outputs.shape should yield the same value for the script of the 🤗 Transformers implementation and the original +implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_246.txt b/chunked/content_aware_chunking/_add_new_model/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd7297d1308a98e0deb369bea95da28cef8105b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_246.txt @@ -0,0 +1 @@ +Next, you should make sure that the output values are identical as well. 
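A small helper along the following lines, assumed here to receive both outputs as PyTorch tensors, makes the shape check and the value check explicit; the function and variable names are illustrative, not part of the guide:

```python
import torch

def compare_outputs(original_output: torch.Tensor, output: torch.Tensor, atol: float = 1e-3) -> None:
    # `original_output` comes from the original repository's debugging script,
    # `output` from the 🤗 Transformers script (hypothetical variable names).
    assert output.shape == original_output.shape, f"{output.shape} vs {original_output.shape}"
    max_diff = (output - original_output).abs().max().item()
    print(f"Max absolute difference: {max_diff:.2e}")
    assert torch.allclose(original_output, output, atol=atol), "Outputs are not equal within atol"
```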
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_247.txt b/chunked/content_aware_chunking/_add_new_model/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d3fc40c38cc699537526425d8ece302de2bdc2f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_247.txt @@ -0,0 +1,2 @@ +This is one of the most difficult +parts of adding a new model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_248.txt b/chunked/content_aware_chunking/_add_new_model/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..abb27973d4ec6bab62185d830f1c469080719947 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_248.txt @@ -0,0 +1,3 @@ +Common reasons why the outputs are not identical are: + +Some layers were not added, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_249.txt b/chunked/content_aware_chunking/_add_new_model/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..29846151180ddf99eb15759c9a74c3054dc75eeb --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_249.txt @@ -0,0 +1,4 @@ +an activation layer was not added, or the residual connection was forgotten +The word embedding matrix was not tied +The wrong positional embeddings are used because the original implementation uses an offset +Dropout is applied during the forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_250.txt b/chunked/content_aware_chunking/_add_new_model/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3484284d0e30ca377dd87fa3b3baf6e8d344af5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_250.txt @@ -0,0 +1,2 @@ +To fix this, make sure model.training is False and that no dropout + layer is falsely activated during the forward pass, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_251.txt b/chunked/content_aware_chunking/_add_new_model/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..01456c7daad170ab8b72568d132f471484aba0a7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_251.txt @@ -0,0 +1,4 @@ +pass self.training to PyTorch's functional dropout + +The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 +Transformers implementation side-by-side and check if there are any differences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_252.txt b/chunked/content_aware_chunking/_add_new_model/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c7cb793391c36426086177da50c7bad82902740 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_252.txt @@ -0,0 +1,3 @@ +Ideally, you should debug/print out +intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 +Transformers implementation shows a different output than the original implementation.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_253.txt b/chunked/content_aware_chunking/_add_new_model/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..240bffb8d12a174eab6952c18dd5126e21328a28 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_253.txt @@ -0,0 +1,2 @@ +First, make sure that the +hard-coded input_ids in both scripts are identical. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_254.txt b/chunked/content_aware_chunking/_add_new_model/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..efbaf82259a73baeba7d288eb2ca5f91559c7e2f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_254.txt @@ -0,0 +1,2 @@ +Next, verify that the outputs of the first transformation of +the input_ids (usually the word embeddings) are identical. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_255.txt b/chunked/content_aware_chunking/_add_new_model/chunk_255.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b398fd24dcfd95a3381d652992afaafd215fbe2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_255.txt @@ -0,0 +1,2 @@ +And then work your way up to the very last layer of the +network. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_256.txt b/chunked/content_aware_chunking/_add_new_model/chunk_256.txt new file mode 100644 index 0000000000000000000000000000000000000000..19018ea36b94c0a8501964721285644e04282dae --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_256.txt @@ -0,0 +1,2 @@ +At some point, you will notice a difference between the two implementations, which should point you to the bug +in the 🤗 Transformers implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_257.txt b/chunked/content_aware_chunking/_add_new_model/chunk_257.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd06bea2758467d734e97aba537ee3e5079458b4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_257.txt @@ -0,0 +1,3 @@ +From our experience, a simple and efficient way is to add many print statements +in both the original implementation and 🤗 Transformers implementation, at the same positions in the network +respectively, and to successively remove print statements showing the same values for intermediate representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_258.txt b/chunked/content_aware_chunking/_add_new_model/chunk_258.txt new file mode 100644 index 0000000000000000000000000000000000000000..01b5b4641aba2947f674d56c91f2b70a1153ca43 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_258.txt @@ -0,0 +1,2 @@ +When you're confident that both implementations yield the same output, verify the outputs with +torch.allclose(original_output, output, atol=1e-3); once that check passes, you're done with the most difficult part!
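If the original model is also written in PyTorch, one way to carry out the side-by-side comparison described above without sprinkling print statements by hand is to register forward hooks on every submodule of both models. This is only a sketch under that assumption, not part of the official workflow; the helper name attach_debug_hooks is hypothetical:

```python
import torch

def attach_debug_hooks(model: torch.nn.Module, tag: str) -> None:
    # Print the first few output values of every submodule so the two implementations
    # can be compared layer by layer for the same hard-coded input_ids.
    for name, module in model.named_modules():
        def _hook(mod, inputs, output, name=name):
            if torch.is_tensor(output):
                print(f"[{tag}] {name}: {output.flatten()[:3].tolist()}")
        module.register_forward_hook(_hook)
```

Running the same input through both hooked models and diffing the printed logs points directly at the first layer whose values diverge.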
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_259.txt b/chunked/content_aware_chunking/_add_new_model/chunk_259.txt new file mode 100644 index 0000000000000000000000000000000000000000..525319d6662df773bc7d83e1c76fa3cb2c72c0bf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_259.txt @@ -0,0 +1,2 @@ +Congratulations - the +work left to be done should be a cakewalk 😊. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_260.txt b/chunked/content_aware_chunking/_add_new_model/chunk_260.txt new file mode 100644 index 0000000000000000000000000000000000000000..e470f6393cd8e515f0e678f5d32a87898f8ca97e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_260.txt @@ -0,0 +1 @@ +8. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_261.txt b/chunked/content_aware_chunking/_add_new_model/chunk_261.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca0f2002b467b5c73629e70c7458d33dbe6fc65c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_261.txt @@ -0,0 +1,2 @@ +Adding all necessary model tests +At this point, you have successfully added a new model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_262.txt b/chunked/content_aware_chunking/_add_new_model/chunk_262.txt new file mode 100644 index 0000000000000000000000000000000000000000..c61ab0c88f1db780d397ef1833d9ae31d5468827 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_262.txt @@ -0,0 +1,2 @@ +However, it is very much possible that the model does not yet +fully comply with the required design. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_263.txt b/chunked/content_aware_chunking/_add_new_model/chunk_263.txt new file mode 100644 index 0000000000000000000000000000000000000000..becb1fb07e23b05d36963bd01b6680686677c37f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_263.txt @@ -0,0 +1,2 @@ +To make sure the implementation is fully compatible with 🤗 Transformers, all +common tests should pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_264.txt b/chunked/content_aware_chunking/_add_new_model/chunk_264.txt new file mode 100644 index 0000000000000000000000000000000000000000..11583083a6d9f1a14a7133a06481b9c5360a35b1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_264.txt @@ -0,0 +1,2 @@ +The Cookiecutter should have automatically added a test file for your model, probably under +tests/models/brand_new_bert/test_modeling_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_265.txt b/chunked/content_aware_chunking/_add_new_model/chunk_265.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ff7d4b10d2eb0512e3ea3f607d2dd52a4138bc8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_265.txt @@ -0,0 +1,8 @@ +Run this test file to verify that all common +tests pass: + +pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py
Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that + +a) The community can easily understand your work by looking at specific tests of brand_new_bert +b) Future changes to your model will not break any important feature of the model.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_266.txt b/chunked/content_aware_chunking/_add_new_model/chunk_266.txt new file mode 100644 index 0000000000000000000000000000000000000000..99a3c962bf6d1a694632c80be6dcccf277f718b7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_266.txt @@ -0,0 +1 @@ +At first, integration tests should be added. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_267.txt b/chunked/content_aware_chunking/_add_new_model/chunk_267.txt new file mode 100644 index 0000000000000000000000000000000000000000..f650680cf8dbc7f8699625cdc2f111ecc7de1e1f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_267.txt @@ -0,0 +1,2 @@ +Those integration tests essentially do the same as the debugging scripts +you used earlier to implement the model in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_268.txt b/chunked/content_aware_chunking/_add_new_model/chunk_268.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f2ac11652f49e188548610601957b507609bc93 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_268.txt @@ -0,0 +1,2 @@ +A template of those model tests has already been added by the +Cookiecutter, called BrandNewBertModelIntegrationTests and only has to be filled out by you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_269.txt b/chunked/content_aware_chunking/_add_new_model/chunk_269.txt new file mode 100644 index 0000000000000000000000000000000000000000..e654211eb62a1a8ebab5bf44f923b22e6f2d0091 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_269.txt @@ -0,0 +1,9 @@ +To ensure that those +tests are passing, run + +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests + +In case you are using Windows, you should replace RUN_SLOW=1 with SET RUN_SLOW=1 + +Second, all features that are special to brand_new_bert should be tested additionally in a separate test under +BrandNewBertModelTester/BrandNewBertModelTest. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_270.txt b/chunked/content_aware_chunking/_add_new_model/chunk_270.txt new file mode 100644 index 0000000000000000000000000000000000000000..44543c5d3fbdd372959ff16a449275b46155b0a1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_270.txt @@ -0,0 +1,5 @@ +This part is often forgotten but is extremely useful in two +ways: + +It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the + special features of brand_new_bert should work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_271.txt b/chunked/content_aware_chunking/_add_new_model/chunk_271.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5bdd31a1db587385da14218cb6f0042ba0954e0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_271.txt @@ -0,0 +1 @@ +Future contributors can quickly test changes to the model by running those special tests.
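A filled-out integration test might look roughly like the sketch below; the checkpoint name author/brand_new_bert-base is a placeholder, and the hard-coded expected values must be taken from the original implementation rather than invented:

```python
import unittest

import torch

from transformers import BrandNewBertModel
from transformers.testing_utils import require_torch, slow, torch_device


@require_torch
class BrandNewBertModelIntegrationTests(unittest.TestCase):
    @slow
    def test_inference_no_head(self):
        model = BrandNewBertModel.from_pretrained("author/brand_new_bert-base").to(torch_device)
        model.eval()
        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]], device=torch_device)
        with torch.no_grad():
            output = model(input_ids).last_hidden_state
        self.assertEqual(output.shape, torch.Size((1, 9, model.config.hidden_size)))
        # Hard-code `expected_slice` from the original implementation's output, then check e.g.
        # self.assertTrue(torch.allclose(output[0, :3, :3], expected_slice, atol=1e-3))
```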
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_272.txt b/chunked/content_aware_chunking/_add_new_model/chunk_272.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aff809b3769023dead0b37953bdd25cdb5424f0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_272.txt @@ -0,0 +1 @@ +9. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_273.txt b/chunked/content_aware_chunking/_add_new_model/chunk_273.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ad7893e75a41834cede37b5042e052f0ccd7489 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_273.txt @@ -0,0 +1,2 @@ +Implement the tokenizer +Next, we should add the tokenizer of brand_new_bert. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_274.txt b/chunked/content_aware_chunking/_add_new_model/chunk_274.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fea9ae7d9f496c098bc1a47138dead143b8bdcf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_274.txt @@ -0,0 +1,2 @@ +Usually, the tokenizer is equivalent to or very similar to an +already existing tokenizer of 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_275.txt b/chunked/content_aware_chunking/_add_new_model/chunk_275.txt new file mode 100644 index 0000000000000000000000000000000000000000..4059803e2788a3deda6ba4e18f4a685e0a846468 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_275.txt @@ -0,0 +1,2 @@ +It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 +Transformers' implementation of the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_276.txt b/chunked/content_aware_chunking/_add_new_model/chunk_276.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f15774b425f6c06869d89ed32ae5657f9d8606d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_276.txt @@ -0,0 +1,2 @@ +To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository +that inputs a string and returns the `input_ids``. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_277.txt b/chunked/content_aware_chunking/_add_new_model/chunk_277.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c2faf4a8cb30212f55523e2e513b7322b794af5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_277.txt @@ -0,0 +1,3 @@ +It could look similar to this (in pseudo-code): +python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_278.txt b/chunked/content_aware_chunking/_add_new_model/chunk_278.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d30b3c7be95c2072776f4a8b9228c44517de93 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_278.txt @@ -0,0 +1,4 @@ +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +You might have to take a deeper look again into the original repository to find the correct tokenizer function or you +might even have to do changes to your clone of the original repository to only output the input_ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_279.txt b/chunked/content_aware_chunking/_add_new_model/chunk_279.txt new file mode 100644 index 0000000000000000000000000000000000000000..34c17bb5fd0d16297ca79a6a8a0c366c808ae198 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_279.txt @@ -0,0 +1,3 @@ +Having written +a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be +created. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_280.txt b/chunked/content_aware_chunking/_add_new_model/chunk_280.txt new file mode 100644 index 0000000000000000000000000000000000000000..96c7d8a1ff9685303b226bc1ceafb5967ebd3d23 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_280.txt @@ -0,0 +1,4 @@ +It should look similar to this: +thon +from transformers import BrandNewBertTokenizer +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_281.txt b/chunked/content_aware_chunking/_add_new_model/chunk_281.txt new file mode 100644 index 0000000000000000000000000000000000000000..79c53e2edf5abb274de8ce730a346ec3786009df --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_281.txt @@ -0,0 +1,4 @@ +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") +input_ids = tokenizer(input_str).input_ids + +When both input_ids yield the same values, as a final step a tokenizer test file should also be added. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_282.txt b/chunked/content_aware_chunking/_add_new_model/chunk_282.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11c99fcc271961d1c6f3f04cc04f5b83a4bd2a8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_282.txt @@ -0,0 +1,2 @@ +Analogous to the modeling test files of brand_new_bert, the tokenization test files of brand_new_bert should +contain a couple of hard-coded integration tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_283.txt b/chunked/content_aware_chunking/_add_new_model/chunk_283.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec4cedfd09d63025d16d23d232c2c6413b1edfb0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_283.txt @@ -0,0 +1 @@ +10. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_284.txt b/chunked/content_aware_chunking/_add_new_model/chunk_284.txt new file mode 100644 index 0000000000000000000000000000000000000000..f670dbee9b0127e83ded4eb2a59575fdad32446b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_284.txt @@ -0,0 +1,3 @@ +Run End-to-end integration tests +Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the +tokenizer to tests/models/brand_new_bert/test_modeling_brand_new_bert.py in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_285.txt b/chunked/content_aware_chunking/_add_new_model/chunk_285.txt new file mode 100644 index 0000000000000000000000000000000000000000..576b7829932631d8740e1eab970ead3df881fce8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_285.txt @@ -0,0 +1,2 @@ +Such a test should show on a meaningful +text-to-text sample that the 🤗 Transformers implementation works as expected. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_286.txt b/chunked/content_aware_chunking/_add_new_model/chunk_286.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c9d187405eb4eb15e12fed384f0cdb33e6b58c6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_286.txt @@ -0,0 +1,2 @@ +A meaningful text-to-text sample can +include e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_287.txt b/chunked/content_aware_chunking/_add_new_model/chunk_287.txt new file mode 100644 index 0000000000000000000000000000000000000000..85e7f48718c7b5f56b3efbfdea6ab761a1a4265f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_287.txt @@ -0,0 +1,2 @@ +a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none +of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_288.txt b/chunked/content_aware_chunking/_add_new_model/chunk_288.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4d287141bffbd0c1d4a6fde419ea785bf9384cf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_288.txt @@ -0,0 +1,2 @@ +In a +final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_289.txt b/chunked/content_aware_chunking/_add_new_model/chunk_289.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0b4f3f9c70a931c3ea86c301cb2b17bd02beaa9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_289.txt @@ -0,0 +1,3 @@ +It can +happen that you forgot to add some .to(self.device) statements to internal tensors of the model, which in such a +test would show in an error. 
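A sketch of such an end-to-end check, combining the tokenizer and the model, is shown below; the checkpoint name author/brand_new_bert-base is again a placeholder, and the shape and values printed at the end should be compared against the original implementation. Running it on a CUDA device is exactly what surfaces forgotten .to(self.device) calls on internal tensors:

```python
import torch

from transformers import BrandNewBertModel, BrandNewBertTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = BrandNewBertTokenizer.from_pretrained("author/brand_new_bert-base")
model = BrandNewBertModel.from_pretrained("author/brand_new_bert-base").to(device).eval()

inputs = tokenizer(
    "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words.",
    return_tensors="pt",
).to(device)
with torch.no_grad():
    last_hidden_state = model(**inputs).last_hidden_state
print(last_hidden_state.shape)  # compare shape and values against the original implementation
```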
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_290.txt b/chunked/content_aware_chunking/_add_new_model/chunk_290.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1120a876fa4559e9cdcdf350e134708324daf74 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_290.txt @@ -0,0 +1,2 @@ +In case you have no access to a GPU, the Hugging Face team can take care of running those +tests for you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_291.txt b/chunked/content_aware_chunking/_add_new_model/chunk_291.txt new file mode 100644 index 0000000000000000000000000000000000000000..d770ba86d9b4b4ab940ea8a80d504c91cae703d5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_291.txt @@ -0,0 +1 @@ +11. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_292.txt b/chunked/content_aware_chunking/_add_new_model/chunk_292.txt new file mode 100644 index 0000000000000000000000000000000000000000..b77ae84df062e6542b78e150cee19af1242a4c5b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_292.txt @@ -0,0 +1,2 @@ +Add Docstring +Now, all the necessary functionality for brand_new_bert is added - you're almost done! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_293.txt b/chunked/content_aware_chunking/_add_new_model/chunk_293.txt new file mode 100644 index 0000000000000000000000000000000000000000..0007cda45558597c16ecffb2bccee9146d667d48 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_293.txt @@ -0,0 +1,2 @@ +The only thing left to add is +a nice docstring and a doc page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_294.txt b/chunked/content_aware_chunking/_add_new_model/chunk_294.txt new file mode 100644 index 0000000000000000000000000000000000000000..550837612cce041bfbbb93466c3652dd58872aa1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_294.txt @@ -0,0 +1,2 @@ +The Cookiecutter should have added a template file called +docs/source/model_doc/brand_new_bert.md that you should fill out. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_295.txt b/chunked/content_aware_chunking/_add_new_model/chunk_295.txt new file mode 100644 index 0000000000000000000000000000000000000000..40ca9582374de3c1ac953ff024a15b69d8b44e51 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_295.txt @@ -0,0 +1,2 @@ +Users of your model will usually first look at +this page before using your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_296.txt b/chunked/content_aware_chunking/_add_new_model/chunk_296.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb90bc77075d3f8842bd017770a0475dd7ad9319 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_296.txt @@ -0,0 +1 @@ +Hence, the documentation must be understandable and concise. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_297.txt b/chunked/content_aware_chunking/_add_new_model/chunk_297.txt new file mode 100644 index 0000000000000000000000000000000000000000..839648fbf3b8084acccd46f62f675478ce1e0b93 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_297.txt @@ -0,0 +1,2 @@ +It is very useful for +the community to add some Tips to show how the model should be used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_298.txt b/chunked/content_aware_chunking/_add_new_model/chunk_298.txt new file mode 100644 index 0000000000000000000000000000000000000000..bda19b5e4e04aaf6649859e72c9cd72929ea8df9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_298.txt @@ -0,0 +1,2 @@ +Don't hesitate to ping the Hugging Face team +regarding the docstrings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_299.txt b/chunked/content_aware_chunking/_add_new_model/chunk_299.txt new file mode 100644 index 0000000000000000000000000000000000000000..090251aba1a7cc5cda60131c0647478ad63651d1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_299.txt @@ -0,0 +1,2 @@ +Next, make sure that the docstring added to src/transformers/models/brand_new_bert/modeling_brand_new_bert.py is +correct and includes all necessary inputs and outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_300.txt b/chunked/content_aware_chunking/_add_new_model/chunk_300.txt new file mode 100644 index 0000000000000000000000000000000000000000..24ce4343e65b38febcfb07c7c6464f0df506a3df --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_300.txt @@ -0,0 +1 @@ +We have a detailed guide about writing documentation and our docstring format here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_301.txt b/chunked/content_aware_chunking/_add_new_model/chunk_301.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2747691572eeac607aa5cb4600c5f8d071d8b3c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_301.txt @@ -0,0 +1,3 @@ +It is always good to remind oneself that documentation should +be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact +point of the community with the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_302.txt b/chunked/content_aware_chunking/_add_new_model/chunk_302.txt new file mode 100644 index 0000000000000000000000000000000000000000..0335df8d74029af190951950fc5c373aa6e37e13 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_302.txt @@ -0,0 +1,2 @@ +Code refactor +Great, now you have added all the necessary code for brand_new_bert.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_303.txt b/chunked/content_aware_chunking/_add_new_model/chunk_303.txt new file mode 100644 index 0000000000000000000000000000000000000000..f82861a38fcb62f171edcb71cb2bac0ee8455cf3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_303.txt @@ -0,0 +1,9 @@ +At this point, you should correct some potential +incorrect code style by running: + +make style +and verify that your coding style passes the quality check: + +make quality +There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in +the tests of your pull request. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_304.txt b/chunked/content_aware_chunking/_add_new_model/chunk_304.txt new file mode 100644 index 0000000000000000000000000000000000000000..53d00a917c0ef44d2527885bb0d5011b1a7a203a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_304.txt @@ -0,0 +1,2 @@ +This is often because of some missing information in the docstring or some incorrect +naming. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_305.txt b/chunked/content_aware_chunking/_add_new_model/chunk_305.txt new file mode 100644 index 0000000000000000000000000000000000000000..c47f0f8aa43cfbba6d8676f8585f4bbe7d0ba59b --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_305.txt @@ -0,0 +1 @@ +The Hugging Face team will surely help you if you're stuck here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_306.txt b/chunked/content_aware_chunking/_add_new_model/chunk_306.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf0efee1ef1148c68368de450bc1ea3f7b92ef5e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_306.txt @@ -0,0 +1 @@ +Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_307.txt b/chunked/content_aware_chunking/_add_new_model/chunk_307.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e7cd7962f82eda0b79132cbc7ab2007a0e0c884 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_307.txt @@ -0,0 +1,2 @@ +With all +tests passing, now it's a good time to go over the added code again and do some refactoring. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_308.txt b/chunked/content_aware_chunking/_add_new_model/chunk_308.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd82cdfa8da6028f9a94355141ac8611effc097 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_308.txt @@ -0,0 +1 @@ +You have now finished the coding part, congratulation! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_309.txt b/chunked/content_aware_chunking/_add_new_model/chunk_309.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ae67e365ea99cef60ea2f43cc5f12c5eede2af2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_309.txt @@ -0,0 +1 @@ +🎉 You are Awesome! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_310.txt b/chunked/content_aware_chunking/_add_new_model/chunk_310.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e2a219c84c629d5b0483a57052df69483e78740 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_310.txt @@ -0,0 +1,2 @@ +😎 +12. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_311.txt b/chunked/content_aware_chunking/_add_new_model/chunk_311.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d0de9e0fd0c1ef31e84ff2079cdd8c4a753b5ca --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_311.txt @@ -0,0 +1,3 @@ +Upload the models to the model hub +In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each +uploaded model checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_312.txt b/chunked/content_aware_chunking/_add_new_model/chunk_312.txt new file mode 100644 index 0000000000000000000000000000000000000000..713b52a8adffb53f3ee568525723228a61f0a0e1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_312.txt @@ -0,0 +1 @@ +You can get familiar with the hub functionalities by reading our Model sharing and uploading Page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_313.txt b/chunked/content_aware_chunking/_add_new_model/chunk_313.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a05a8962d32114caaf06f5b2c02be27475636fb --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_313.txt @@ -0,0 +1,3 @@ +You should work alongside the Hugging Face team here to decide on a fitting name for each +checkpoint and to get the required access rights to be able to upload the model under the author's organization of +brand_new_bert. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_314.txt b/chunked/content_aware_chunking/_add_new_model/chunk_314.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8984ec7ba734d22440a1b14d01565fc8b0c40d8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_314.txt @@ -0,0 +1 @@ +The push_to_hub method, present in all models in transformers, is a quick and efficient way to push your checkpoint to the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_315.txt b/chunked/content_aware_chunking/_add_new_model/chunk_315.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0f3538920f8862878c0b7135a3e88afe0c659fd --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_315.txt @@ -0,0 +1,4 @@ +A little snippet is pasted below: +thon +brand_new_bert.push_to_hub("brand_new_bert") +Uncomment the following line to push to an organization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_316.txt b/chunked/content_aware_chunking/_add_new_model/chunk_316.txt new file mode 100644 index 0000000000000000000000000000000000000000..8818fc085a549b469675446fa5e97c03229fee41 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_316.txt @@ -0,0 +1,3 @@ +brand_new_bert.push_to_hub("/brand_new_bert") + +It is worth spending some time to create fitting model cards for each checkpoint. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_317.txt b/chunked/content_aware_chunking/_add_new_model/chunk_317.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0729992d1d617bcbb0de0d1112a0ee49359efad --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_317.txt @@ -0,0 +1,2 @@ +The model cards should highlight the +specific characteristics of this particular checkpoint, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_318.txt b/chunked/content_aware_chunking/_add_new_model/chunk_318.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b3301e48f5ab92927b3f3eba3658a884c6f813e --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_318.txt @@ -0,0 +1,2 @@ +On which dataset was the checkpoint +pretrained/fine-tuned on? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_319.txt b/chunked/content_aware_chunking/_add_new_model/chunk_319.txt new file mode 100644 index 0000000000000000000000000000000000000000..d609335bc61db1e073725c33485200ddcb646eb0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_319.txt @@ -0,0 +1 @@ +On what down-stream task should the model be used? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_320.txt b/chunked/content_aware_chunking/_add_new_model/chunk_320.txt new file mode 100644 index 0000000000000000000000000000000000000000..002db8d48b027fed584b8ced9486c068ffb893f0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_320.txt @@ -0,0 +1,2 @@ +And also include some code on how to +correctly use the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_321.txt b/chunked/content_aware_chunking/_add_new_model/chunk_321.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4b697a804065352582ff23c4c6d95ea1a00c785 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_321.txt @@ -0,0 +1 @@ +13. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_322.txt b/chunked/content_aware_chunking/_add_new_model/chunk_322.txt new file mode 100644 index 0000000000000000000000000000000000000000..59226f52d44cc97273c6fb0263be028b9d2e965c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_322.txt @@ -0,0 +1,3 @@ +(Optional) Add notebook +It is very helpful to add a notebook that showcases in-detail how brand_new_bert can be used for inference and/or +fine-tuned on a downstream task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_323.txt b/chunked/content_aware_chunking/_add_new_model/chunk_323.txt new file mode 100644 index 0000000000000000000000000000000000000000..a03d904ea000d631e02f9fb7002b0d5e50827e8d --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_323.txt @@ -0,0 +1 @@ +This is not mandatory to merge your PR, but very useful for the community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_324.txt b/chunked/content_aware_chunking/_add_new_model/chunk_324.txt new file mode 100644 index 0000000000000000000000000000000000000000..018c4e0bccf367bef6943eac1440ba0e3742f713 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_324.txt @@ -0,0 +1 @@ +14. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_325.txt b/chunked/content_aware_chunking/_add_new_model/chunk_325.txt new file mode 100644 index 0000000000000000000000000000000000000000..85e2c0109c0800dc999b3c7cf1e109a38c7d7480 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_325.txt @@ -0,0 +1,2 @@ +Submit your finished PR +You're done programming now and can move to the last step, which is getting your PR merged into main. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_326.txt b/chunked/content_aware_chunking/_add_new_model/chunk_326.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceda2b4504743e0907f6a687388807ee544e314c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_326.txt @@ -0,0 +1,4 @@ +Usually, the +Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished +PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your +reviewer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_327.txt b/chunked/content_aware_chunking/_add_new_model/chunk_327.txt new file mode 100644 index 0000000000000000000000000000000000000000..5369adc51be9fc1893ad645f60485ff996ee8fc7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_327.txt @@ -0,0 +1 @@ +Share your work!! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_328.txt b/chunked/content_aware_chunking/_add_new_model/chunk_328.txt new file mode 100644 index 0000000000000000000000000000000000000000..204024c8e395de52eeb28567f7604b836b1fd52c --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_328.txt @@ -0,0 +1 @@ +Now, it's time to get some credit from the community for your work! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_329.txt b/chunked/content_aware_chunking/_add_new_model/chunk_329.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c6ccb722d86854474075977b3248627df8aa345 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_329.txt @@ -0,0 +1,2 @@ +Having completed a model addition is a major +contribution to Transformers and the whole NLP community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_330.txt b/chunked/content_aware_chunking/_add_new_model/chunk_330.txt new file mode 100644 index 0000000000000000000000000000000000000000..167331c483c14e0872de2c70e95386e1cd39bbe1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_330.txt @@ -0,0 +1,2 @@ +Your code and the ported pre-trained models will certainly be +used by hundreds and possibly even thousands of developers and researchers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_331.txt b/chunked/content_aware_chunking/_add_new_model/chunk_331.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e92e1f3e471c5af9219872646f5bf16007a51cf --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_331.txt @@ -0,0 +1,2 @@ +You should be proud of your work and share +your achievements with the community. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_332.txt b/chunked/content_aware_chunking/_add_new_model/chunk_332.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd48cad6b4bb422158574de2de945cb9c3eb149f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_332.txt @@ -0,0 +1 @@ +You have made another model that is super easy to access for everyone in the community! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_model/chunk_333.txt b/chunked/content_aware_chunking/_add_new_model/chunk_333.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ae2b406f4187acd1551693a6a5775cfabc54ff4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_model/chunk_333.txt @@ -0,0 +1 @@ +🤯 \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_17.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b22bd0976e504b590b5c5bb70ba9faeacf89363 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_17.txt @@ -0,0 +1,2 @@ +_sanitize_parameters exists to allow users to pass any parameters whenever they wish, be it at initialization +time pipeline(., maybe_arg=4) or at call time pipe = pipeline(); output = pipe(., maybe_arg=4). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_18.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba758c09d5c8a9e328cc4820e5d729828ec11aba --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_18.txt @@ -0,0 +1,2 @@ +The returns of _sanitize_parameters are the 3 dicts of kwargs that will be passed directly to preprocess, +_forward, and postprocess. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_19.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b016b7f50761e4c0e656e2c6fe5c68aba08fbc2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_19.txt @@ -0,0 +1 @@ +Don't fill anything if the caller didn't call with any extra parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_20.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..12d184892f31c722b557c09a365fc7f70fec6468 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_20.txt @@ -0,0 +1,2 @@ +That +allows to keep the default arguments in the function definition which is always more "natural". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_21.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d889d7eb6725eaad5aee7e7bb6c9a4361c562847 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_21.txt @@ -0,0 +1 @@ +A classic example would be a top_k argument in the post processing in classification tasks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_22.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..261585d58d734984e5287dea41c18b8c22eb8ce2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_22.txt @@ -0,0 +1,11 @@ +thon + +pipe = pipeline("my-new-task") +pipe("This is a test") +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05} +{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}] +pipe("This is a test", top_k=2) +[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}] + +In order to achieve that, we'll update our postprocess method with a default parameter to 5. and edit +_sanitize_parameters to allow this new parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_23.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..11e01d97a16b3192d03e621b76e0fb618014e807 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_23.txt @@ -0,0 +1,16 @@ +thon +def postprocess(self, model_outputs, top_k=5): + best_class = model_outputs["logits"].softmax(-1) + # Add logic to handle top_k + return best_class +def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] +postprocess_kwargs = {} +if "top_k" in kwargs: + postprocess_kwargs["top_k"] = kwargs["top_k"] +return preprocess_kwargs, {}, postprocess_kwargs + +Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy +without requiring users to understand new kinds of objects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_24.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..83eabcb6047becdec5f9cbcce9295f8881853a36 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_24.txt @@ -0,0 +1,24 @@ +It's also relatively common to support many different types +of arguments for ease of use (audio files, which can be filenames, URLs or pure bytes) +Adding it to the list of supported tasks +To register your new-task to the list of supported tasks, you have to add it to the PIPELINE_REGISTRY: +thon +from transformers.pipelines import PIPELINE_REGISTRY +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, +) + +You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took "abcdef") as well as the type: +python +PIPELINE_REGISTRY.register_pipeline( + "new-task", + pipeline_class=MyPipeline, + pt_model=AutoModelForSequenceClassification, + default={"pt": ("user/awesome_model", "abcdef")}, + type="text", # current support type: text, audio, image, multimodal +) +Share your pipeline on the Hub +To share your custom pipeline on the Hub, you just have to save the custom code of your Pipeline subclass in a +python file. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_25.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..1658cdaf72837a9afb89c256120e40a686b91508 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_25.txt @@ -0,0 +1,31 @@ +For instance, let's say we want to use a custom pipeline for sentence pair classification like this: + +import numpy as np +from transformers import Pipeline +def softmax(outputs): + maxes = np.max(outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) +class PairClassificationPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "second_text" in kwargs: + preprocess_kwargs["second_text"] = kwargs["second_text"] + return preprocess_kwargs, {}, {} +def preprocess(self, text, second_text=None): + return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework) + +def _forward(self, model_inputs): + return self.model(**model_inputs) + +def postprocess(self, model_outputs): + logits = model_outputs.logits[0].numpy() + probabilities = softmax(logits) + + best_class = np.argmax(probabilities) + label = self.model.config.id2label[best_class] + score = probabilities[best_class].item() + logits = logits.tolist() + return {"label": label, "score": score, "logits": logits} + +The implementation is framework agnostic, and will work for PyTorch and TensorFlow models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_26.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac70d3913dda9fbb1781e8b5cc7da459579180bc --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_26.txt @@ -0,0 +1,14 @@ +If we have saved this in +a file named pair_classification.py, we can then import it and register it like this: + +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) + +Once this is done, we can use it with a pretrained model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_27.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f432f402363e6b33cc7ff30e0ae398be31d5487 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_27.txt @@ -0,0 +1,2 @@ +For instance sgugger/finetuned-bert-mrpc has been +fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. 
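As a quick illustration of how the extra second_text argument flows through _sanitize_parameters and preprocess, here is a hedged usage sketch (the instantiation mirrors the snippet that follows; the returned label and score are made-up values, since the actual label names depend on the checkpoint's id2label mapping):

from transformers import pipeline

pair_classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")
result = pair_classifier("I like you", second_text="I love you")
# result is a plain dict, e.g. {"label": "...", "score": 0.99, "logits": [...]}  (illustrative values only)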
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_28.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5a31cfd5eb59586a503e496b0cebf8915319385 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_28.txt @@ -0,0 +1,13 @@ +from transformers import pipeline +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") + +Then we can share it on the Hub by using the save_pretrained method in a Repository: + +from huggingface_hub import Repository +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() + +This will copy the file where you defined PairClassificationPipeline inside the folder "test-dynamic-pipeline", +along with saving the model and tokenizer of the pipeline, before pushing everything into the repository +{your_username}/test-dynamic-pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_29.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..99ad4e27a325273d86865a70b270af32f9cfe651 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_29.txt @@ -0,0 +1,9 @@ +After that, anyone can use it as long as they provide the option +trust_remote_code=True: + +from transformers import pipeline +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) + +Add the pipeline to 🤗 Transformers +If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the pipelines submodule +with the code of your pipeline, then add it to the list of tasks defined in pipelines/__init__.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_30.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5431e984d61470d30ff253f0d0afa27614344bbd --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_30.txt @@ -0,0 +1 @@ +Then you will need to add tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_31.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f79de4953595133cf3bd26fc69010952006afc6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_31.txt @@ -0,0 +1 @@ +Create a new file tests/test_pipelines_MY_PIPELINE.py with examples of the other tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_32.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..af4f3b9e0a46eb942329d6e61e4653db62cc5e97 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_32.txt @@ -0,0 +1,2 @@ +The run_pipeline_test function will be very generic and run on small random models on every possible +architecture as defined by model_mapping and tf_model_mapping. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_33.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd66496f754123fda6ccb6ae9371e601bf241fc4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_33.txt @@ -0,0 +1,2 @@ +This is very important to test future compatibility, meaning if someone adds a new model for +XXXForQuestionAnswering then the pipeline test will attempt to run on it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_34.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..12db79ba21306a54e7ee497171a6128bc8186be2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_34.txt @@ -0,0 +1,3 @@ +Because the models are random it's +impossible to check for actual values, that's why there is a helper ANY that will simply attempt to match the +output of the pipeline TYPE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_35.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..03dd3325640275838b7e123cdc05d7e31f690e26 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_35.txt @@ -0,0 +1 @@ +You also need to implement 2 (ideally 4) tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_36.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..149d78103129a2bc8782e8ef440a073197a4a8fa --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_36.txt @@ -0,0 +1,2 @@ +test_small_model_pt : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_37.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..74dc6458399c0ac932e0894bd97ccc4691f80b7f --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_37.txt @@ -0,0 +1 @@ +The results should be the same as test_small_model_tf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_38.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa627370b34f1cfdebed2bb0925bf915d554a74a --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_38.txt @@ -0,0 +1,2 @@ +test_small_model_tf : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_39.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb3b2581e0ec604f2ec53114d3bd9028862d56c2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_39.txt @@ -0,0 +1 @@ +The results should be the same as test_small_model_pt. 
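Before moving on to the optional slow tests, here is a rough, hedged sketch of what test_small_model_pt could look like; the task name, the tiny checkpoint, the output keys, and the test class name are all placeholders, and real pipeline tests additionally rely on shared helpers such as the ANY matcher mentioned above:

import unittest

from transformers import pipeline
from transformers.testing_utils import require_torch

class MyNewTaskPipelineTests(unittest.TestCase):
    @require_torch
    def test_small_model_pt(self):
        # A tiny random checkpoint keeps the test fast; its predictions are meaningless,
        # so only the structure and types of the outputs are checked, not the values.
        pipe = pipeline("my-new-task", model="hf-internal-testing/tiny-random-bert")
        outputs = pipe("This is a test")
        self.assertIsInstance(outputs, list)
        self.assertIsInstance(outputs[0]["label"], str)
        self.assertIsInstance(outputs[0]["score"], float)

test_small_model_tf would mirror this sketch with a TensorFlow-compatible tiny model and must yield the same outputs.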
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_40.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecde8ea2a7b3e642a501b74cb589a96d57ca00a9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_40.txt @@ -0,0 +1,2 @@ +test_large_model_pt (optional): Tests the pipeline on a real pipeline where the results are supposed to + make sense. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_41.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..95bc7e94d74f2985c197f0f3e20a41aeea292156 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_41.txt @@ -0,0 +1 @@ +These tests are slow and should be marked as such. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_42.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e79661ac7aa6ccabdfe21b4a4ca6380591e551c3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_42.txt @@ -0,0 +1,2 @@ +Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_43.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc86cbca1ed6adeefaf3a5119ba7f245437fd2d5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_43.txt @@ -0,0 +1,2 @@ +test_large_model_tf (optional): Tests the pipeline on a real pipeline where the results are supposed to + make sense. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_44.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..95bc7e94d74f2985c197f0f3e20a41aeea292156 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_44.txt @@ -0,0 +1 @@ +These tests are slow and should be marked as such. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_new_pipeline/chunk_45.txt b/chunked/content_aware_chunking/_add_new_pipeline/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..e79661ac7aa6ccabdfe21b4a4ca6380591e551c3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_new_pipeline/chunk_45.txt @@ -0,0 +1,2 @@ +Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_100.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..949f8cce309b723d5948b1829df18ed0e07820f9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_100.txt @@ -0,0 +1,2 @@ +TFBrandNewBertMainLayer is decorated with @keras_serializable +5. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_101.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..6239e7b829197d8901d2bfe6d68c6a34e21e6a91 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_101.txt @@ -0,0 +1,2 @@ +A TensorFlow model can be loaded from PyTorch weights using TFBrandNewBert.from_pretrained(model_repo, from_pt=True) +6. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_102.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbb09f31aa0b166332634b1c58e282cedd37a071 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_102.txt @@ -0,0 +1,2 @@ +You can call the TensorFlow model using the expected input format +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_103.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4e564c9fe71ebe3f3b40f9319130139a7482754 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_103.txt @@ -0,0 +1,2 @@ +Add model tests +Hurray, you've implemented a TensorFlow model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_104.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..6189dd6effe391e7c25f0b6a5f9d8232b5e2519b --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_104.txt @@ -0,0 +1,2 @@ +Now it's time to add tests to make sure that your model behaves as +expected. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_105.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..a78b7f2efca28b1b7ae4b43f5e84b020f00bf00e --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_105.txt @@ -0,0 +1,3 @@ +As in the previous section, we suggest you start by copying the test_modeling_brand_new_bert.py file in +tests/models/brand_new_bert/ into test_modeling_tf_brand_new_bert.py, and continue by making the necessary +TensorFlow replacements. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_106.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..d84690c1fe9b989bdad93c417df7c89e1abc114c --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_106.txt @@ -0,0 +1,2 @@ +For now, in all .from_pretrained() calls, you should use the from_pt=True flag to load +the existing PyTorch weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_107.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..889cb0a06a4a4579d670bdb2e253dccd959e9eca --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_107.txt @@ -0,0 +1 @@ +After you're done, it's time for the moment of truth: run the tests! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_108.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e3aac92f444cf3ae83f7ccd14caaf095171eb37 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_108.txt @@ -0,0 +1,5 @@ +😬 + +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +The most likely outcome is that you'll see a bunch of errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_109.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..a402ea336388d0ee320ffcd5d8cb0dac607f371b --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_109.txt @@ -0,0 +1 @@ +Don't worry, this is expected! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_110.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9203a7be417b3d8fea9ea04664f8aa64d17e12 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_110.txt @@ -0,0 +1,2 @@ +Debugging ML models is +notoriously hard, and the key ingredient to success is patience (and breakpoint()). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_111.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6eeb134627c3756f68a30189c2ba122b3990a36 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_111.txt @@ -0,0 +1,2 @@ +In our experience, the hardest +problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_112.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..6638b1e2ebf9b0f087b99b531f8932b4acb810f6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_112.txt @@ -0,0 +1,2 @@ +In other cases, a general test might not be directly applicable to your model, in which case we suggest an override +at the model test class level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_113.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..36f0d2526f9b8c63cebe1915de9d8164566a922b --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_113.txt @@ -0,0 +1,2 @@ +Regardless of the issue, don't hesitate to ask for help in your draft pull request if +you're stuck. 
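For the case where a generic test does not apply to your model, an override at the model test class level could look roughly like the following minimal sketch (in the real test_modeling_tf_brand_new_bert.py the class also inherits the common TF tester mixins, and the skipped test name here is hypothetical):

import unittest

class TFBrandNewBertModelTest(unittest.TestCase):
    # Only the override pattern is sketched here: redefine the inherited test and
    # either adapt it to the architecture or skip it with an explanatory reason.
    @unittest.skip("BrandNewBert's custom attention is not covered by the generic test (hypothetical example)")
    def test_generic_behaviour_that_does_not_apply(self):
        pass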
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_114.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..638960ee5bfc24906ba28dd7f53a8b738e217058 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_114.txt @@ -0,0 +1 @@ +When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_115.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17a00b1ad605a3b841c0963e90f6d0b6985c2bf --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_115.txt @@ -0,0 +1,2 @@ +🎉 +6.-7. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_116.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..cae3a8d4f01746b7de126867d21602fe21b9c2d9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_116.txt @@ -0,0 +1,2 @@ +Ensure everyone can use your model +6. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_117.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf145b8eef0150e6d44018d5923472f86a8e9f50 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_117.txt @@ -0,0 +1,2 @@ +Submit the pull request +Once you're done with the implementation and the tests, it's time to submit a pull request. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_118.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..751a69c310d820717255d7be862ba9ec4fb61d02 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_118.txt @@ -0,0 +1,2 @@ +Before pushing your code, +run our code formatting utility, make fixup 🪄. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_119.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..716ec7c5c2753b97822ae07f58d0af98b192e2e0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_119.txt @@ -0,0 +1,2 @@ +This will automatically fix any formatting issues, which would cause +our automatic checks to fail. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_120.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..18536e2d2f8739922dd5632a6570395dab7a4ccd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_120.txt @@ -0,0 +1 @@ +It's now time to convert your draft pull request into a real pull request. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_121.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec36a9ab4e83d35e3df02da98de85d121c23e47c --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_121.txt @@ -0,0 +1,2 @@ +To do so, click on the "Ready for +review" button and add Joao (@gante) and Matt (@Rocketknight1) as reviewers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_122.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5aed48b46731997b9ff4039469e058da7c0202a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_122.txt @@ -0,0 +1,2 @@ +A model pull request will need +at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_123.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..85884b5507361dccf3689604aee7daf6cc844dfc --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_123.txt @@ -0,0 +1,2 @@ +After all reviewers are happy with the state of your PR, the final action point is to remove the from_pt=True flag in +.from_pretrained() calls. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_124.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..78fb02ebfd6af6d8642fe17d3cbbdcf77807ee27 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_124.txt @@ -0,0 +1 @@ +Since there are no TensorFlow weights, you will have to add them! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_125.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5959261da66860eecef21e94f3c003ae1a9e0b5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_125.txt @@ -0,0 +1,2 @@ +Check the section +below for instructions on how to do it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_126.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fb3572c622c25a4f4bfbcb9f11ddad3c052acb3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_126.txt @@ -0,0 +1,6 @@ +Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are +green, double-check the tests locally one last time + +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +and we will merge your PR! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_127.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f569a995a77585a4de800e2457d1221fc0151bb --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_127.txt @@ -0,0 +1,2 @@ +Congratulations on the milestone 🎉 +7. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_128.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..adf410a2780439e0b86fdbde91e48d362d79bd47 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_128.txt @@ -0,0 +1,2 @@ +(Optional) Build demos and share with the world +One of the hardest parts about open-source is discovery. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_129.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4057a889255a97b6492fa2786d312a5596edc32 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_129.txt @@ -0,0 +1,2 @@ +How can the other users learn about the existence of your +fabulous TensorFlow contribution? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_130.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b7e253639079eac07753fc97a9b65001f2e1215 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_130.txt @@ -0,0 +1 @@ +With proper communication, of course! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_131.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a7843cbb412993439bccb19e21d71297eb3c10f --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_131.txt @@ -0,0 +1,3 @@ +📣 +There are two main ways to share your model with the community: +- Build demos. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_132.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..466dd2bb10482cea7b85bfcdd7b1a45cdcbe3b0e --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_132.txt @@ -0,0 +1 @@ +These include Gradio demos, notebooks, and other fun ways to show off your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_133.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..8db5f9f027e780be6900ee410b901bde5cacb9b8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_133.txt @@ -0,0 +1,2 @@ +We highly + encourage you to add a notebook to our community-driven demos. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_134.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9e716f1a24a21f96ce595ca5201e660835b17c9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_134.txt @@ -0,0 +1 @@ +- Share stories on social media like Twitter and LinkedIn. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_135.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c69644ba29ad2031238833ff04e9b62f181cf49 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_135.txt @@ -0,0 +1,3 @@ +You should be proud of your work and share + your achievement with the community - your model can now be used by thousands of engineers and researchers around + the world ðŸŒ! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_136.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6f912b0481f352044e991da747d44b13332ee9c --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_136.txt @@ -0,0 +1 @@ +We will be happy to retweet your posts and help you share your work with the community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_137.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b4b8adf309083d98eb0d2b558105ffd577b5edd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_137.txt @@ -0,0 +1,3 @@ +Adding TensorFlow weights to 🤗 Hub +Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into +TensorFlow weights is a breeze! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_138.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e02e1b2b5f46120f774acf9fc15493d6b95f4d --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_138.txt @@ -0,0 +1,2 @@ +Here's how to do it: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_139.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..35fe0b0919af36a70c44be91a315f2559b6e6534 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_139.txt @@ -0,0 +1 @@ +Make sure you are logged into your Hugging Face account in your terminal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_140.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..2740121e15d8c6977403461a3a93bb65a7f51e88 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_140.txt @@ -0,0 +1,3 @@ +You can log in using the command + huggingface-cli login (you can find your access tokens here) +2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_141.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..a34098fb57044b4fc1d413642095f05db262b0dc --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_141.txt @@ -0,0 +1,3 @@ +Run transformers-cli pt-to-tf --model-name foo/bar, where foo/bar is the name of the model repository + containing the PyTorch weights you want to convert +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_142.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..2daf6e21fd685783df81ef60ade651f4f24df032 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_142.txt @@ -0,0 +1,2 @@ +Tag @joaogante and @Rocketknight1 in the 🤗 Hub PR the command above has just created +That's it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_143.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9c688524a00285878a654a53e574c2949aaf398 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_143.txt @@ -0,0 +1,4 @@ +🎉 +Debugging mismatches across ML frameworks 🛠+At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you +might come across errors complaining about mismatches between PyTorch and TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_144.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4189257c01c387cb961896a348c82041d871906 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_144.txt @@ -0,0 +1,2 @@ +You might even decide to open the +model architecture code for the two frameworks, and find that they look identical. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_145.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..51d48ab88cb5ca75456f18828a335206538c77e0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_145.txt @@ -0,0 +1 @@ +What's going on? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_146.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e56fb156c03c2d3ffc53d1d7811650b8bdbf370 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_146.txt @@ -0,0 +1,2 @@ +🤔 +First of all, let's talk about why understanding these mismatches matters. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_147.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..f855fcfd2555e5d3151243c07b3d2a1483fd8146 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_147.txt @@ -0,0 +1,2 @@ +Many community members will use 🤗 +Transformers models out of the box, and trust that our models behave as expected. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_148.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..1093d338b94c26279a8aa4f44dd216e650363ae6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_148.txt @@ -0,0 +1,3 @@ +When there is a large mismatch +between the two frameworks, it implies that the model is not following the reference implementation for at least one +of the frameworks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_149.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..adfe557ff1645f60e7605cc1fc3263c3e81a1af9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_149.txt @@ -0,0 +1 @@ +This might lead to silent failures, in which the model runs but has poor performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_150.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..fae4c504acdbb1dfdad9389855f33d6d2d0cb2b9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_150.txt @@ -0,0 +1,2 @@ +This is +arguably worse than a model that fails to run at all! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_151.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fb804117e992ed1ba34626b85ac3c2a5dedd6bd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_151.txt @@ -0,0 +1,2 @@ +To that end, we aim at having a framework mismatch smaller than +1e-5 at all stages of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_152.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..33459b9b881ab98c14c06bc48fb9a41bac275850 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_152.txt @@ -0,0 +1 @@ +As in other numerical problems, the devil is in the details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_153.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf3b2bfe043ceeaacf3eddc150a5d615dea00159 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_153.txt @@ -0,0 +1,2 @@ +And as in any detail-oriented craft, the secret +ingredient here is patience. 
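To make the 1e-5 target concrete, here is a minimal sketch of how such a cross-framework comparison is typically measured; the class names are this guide's placeholders, the checkpoint name is hypothetical, and only the final hidden states are compared, although the same idea applies to any intermediate tensor:

import numpy as np
import tensorflow as tf
import torch

from transformers import BrandNewBertModel, TFBrandNewBertModel  # placeholder class names

pt_model = BrandNewBertModel.from_pretrained("author/brand_new_bert")
tf_model = TFBrandNewBertModel.from_pretrained("author/brand_new_bert", from_pt=True)

input_ids = [[0, 4, 17, 23, 5, 2]]  # arbitrary token ids, fed identically to both frameworks
with torch.no_grad():
    pt_hidden = pt_model(torch.tensor(input_ids)).last_hidden_state.numpy()
tf_hidden = tf_model(tf.constant(input_ids)).last_hidden_state.numpy()

max_diff = np.max(np.abs(pt_hidden - tf_hidden))
print(f"max absolute difference: {max_diff:.2e}")  # the goal is to stay below 1e-5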
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_154.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf15448b11c8c0ad440d37187c7255f8d485c63c --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_154.txt @@ -0,0 +1,2 @@ +Here is our suggested workflow for when you come across this type of issues: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_155.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbd13d06e50c783df9b21679f57b95a684677e2b --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_155.txt @@ -0,0 +1 @@ +Locate the source of mismatches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_156.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..bef5f891b90fcd5e4702cd511930156ac6d7062a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_156.txt @@ -0,0 +1,2 @@ +The model you're converting probably has near identical inner variables up to a + certain point. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_157.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..11685f04972314f12213b255048c83e0587b5809 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_157.txt @@ -0,0 +1,2 @@ +Place breakpoint() statements in the two frameworks' architectures, and compare the values of the + numerical variables in a top-down fashion until you find the source of the problems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_158.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_158.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_159.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..27d44c9b92977cf67e82cceb675e52d986abe855 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_159.txt @@ -0,0 +1 @@ +Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_160.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b6d1131978c50ebd8ad015d711b88c603143f50 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_160.txt @@ -0,0 +1,2 @@ +It is possible + that we've seen a similar problem before and can promptly provide a solution. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_161.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b66534063c90e5629de81fee53c0d69252208ae --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_161.txt @@ -0,0 +1,2 @@ +As a fallback, scan popular pages + like StackOverflow and GitHub issues. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_162.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_162.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_163.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6893531793829b173e62ed2cc1195f4c2e74e4e6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_163.txt @@ -0,0 +1 @@ +If there is no solution in sight, it means you'll have to go deeper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_164.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3a0c59d84c711d953ad9a559dd024c134767cfd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_164.txt @@ -0,0 +1,2 @@ +The good news is that you've located the + issue, so you can focus on the problematic instruction, abstracting away the rest of the model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_165.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..45722ca60a61bbeb8c8827ad9adc0703ff224e02 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_165.txt @@ -0,0 +1,2 @@ +The bad news is + that you'll have to venture into the source implementation of said instruction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_166.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..721e86a3c84142d872f32ffbe3806962053001a4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_166.txt @@ -0,0 +1,2 @@ +In some cases, you might find an + issue with a reference implementation - don't abstain from opening an issue in the upstream repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_167.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..410f5330af120bda18889a8a7e56a1d1722a6691 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_167.txt @@ -0,0 +1 @@ +In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_168.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..b418062b60e09371f87b90c7abbf5c3b2146f670 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_168.txt @@ -0,0 +1,2 @@ +When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we +might decide to ignore it in favor of distributing the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_169.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..4325d2703008db2c37bc70a51f9cbe3f368e6afe --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_169.txt @@ -0,0 +1,2 @@ +The pt-to-tf CLI mentioned above has a --max-error +flag to override the error message at weight conversion time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_47.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc6fa396deda74b600d25626c1b2010f12e1c5b5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_47.txt @@ -0,0 +1,2 @@ +This will +be your TensorFlow model file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_48.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8988c03a4e6f202d69bf4f2e2682a5fd5dc8dd8 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_48.txt @@ -0,0 +1,3 @@ +Push the changes to your account using: + +git add . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_49.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b29c5882912029fe8daf9df74de1e4e5db6f78 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_49.txt @@ -0,0 +1,4 @@ +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert + +Once you are satisfied, go to the webpage of your fork on GitHub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_50.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75556052af8e2c70205a6902f1b28b7aa5eb5f5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_50.txt @@ -0,0 +1 @@ +Click on “Pull requestâ€. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_51.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0b1892d3343828da4f6090d9709dfe62779eb1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_51.txt @@ -0,0 +1,3 @@ +Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_52.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..db3be93eee9eb5990852f4089e0fd533d85a1ef4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_52.txt @@ -0,0 +1 @@ +Change the PR into a draft by clicking on “Convert to draft†on the right of the GitHub pull request web page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_53.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce7cd94d26119e111d7d141c76ab9b318847757c --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_53.txt @@ -0,0 +1 @@ +Now you have set up a development environment to port BrandNewBert to TensorFlow in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_54.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_54.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_55.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb54f07b41b6d05c6fc647b9c827880ea0958716 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_55.txt @@ -0,0 +1,2 @@ +(Optional) Understand theoretical aspects and the existing implementation +You should take some time to read BrandNewBert's paper, if such descriptive work exists. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_56.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd4be4688023f1f2e3249bdb9bea1fd5c6fd6fe4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_56.txt @@ -0,0 +1,2 @@ +There might be large +sections of the paper that are difficult to understand. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_57.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb9943803b4b684b35e49675ea115a4a755b52b --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_57.txt @@ -0,0 +1 @@ +If this is the case, this is fine - don't worry! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_58.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b03ee98a9ea6eb018ac250c052b437ca5d53f32 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_58.txt @@ -0,0 +1,3 @@ +The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers using TensorFlow. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_59.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd8ca70a28442f2dd4703003fd9060213554c27 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_59.txt @@ -0,0 +1,3 @@ +That being said, you don't have to spend too +much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation +page (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_60.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..4273a7e53f73a3eba3de56b8d96c6f0dd2313f6a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_60.txt @@ -0,0 +1 @@ +model docs for BERT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_61.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..339a9fd947e39123c6fd823bac238a878cac693a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_61.txt @@ -0,0 +1,2 @@ +After you've grasped the basics of the models you are about to implement, it's important to understand the existing +implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_62.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..00e345c5327d0a58907691051d325823776ad6d0 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_62.txt @@ -0,0 +1,2 @@ +This is a great chance to confirm that a working implementation matches your expectations for the +model, as well as to foresee technical challenges on the TensorFlow side. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_63.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..95ab810c7e4a4edad10b50f22763a94fe7ed8e88 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_63.txt @@ -0,0 +1 @@ +It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_64.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e795c308bbb6af0b180764333bdd09b339db90 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_64.txt @@ -0,0 +1,2 @@ +It is +definitely not a requirement that you understand all facets of the model at this stage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_65.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d22be201424e2ef5afa724c0084415ab06f5ee6 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_65.txt @@ -0,0 +1,2 @@ +Nevertheless, we highly +encourage you to clear any pressing questions in our forum. 
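As part of studying the existing implementation, it can help to instantiate it and print its module hierarchy, which is exactly the structure the TensorFlow port will mirror. A minimal sketch using the guide's placeholder class names:

from transformers import BrandNewBertConfig, BrandNewBertModel

config = BrandNewBertConfig()      # holds the architectural hyperparameters (hidden size, number of layers, ...)
model = BrandNewBertModel(config)  # randomly initialized PyTorch reference implementation

# Printing a PyTorch model lists every nested submodule - a useful map for the TensorFlow port
print(model)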
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_66.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_66.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_67.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3e24f0512d350d397c8e0454fbff4db2c12f25d --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_67.txt @@ -0,0 +1,2 @@ +Model implementation +Now it's time to finally start coding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_68.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3e5514feaa37633cf2d19f88ca4df48b315c9d5 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_68.txt @@ -0,0 +1,3 @@ +Our suggested starting point is the PyTorch file itself: copy the contents of +modeling_brand_new_bert.py inside src/transformers/models/brand_new_bert/ into +modeling_tf_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_69.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..640135098aa5ff474bce21bb3cc339488def11a3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_69.txt @@ -0,0 +1,3 @@ +The goal of this section is to modify the file and update the import structure of +🤗 Transformers such that you can import TFBrandNewBert and +TFBrandNewBert.from_pretrained(model_repo, from_pt=True) successfully loads a working TensorFlow BrandNewBert model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_70.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..5891c283f03324c301e6d6856f2ef370d1d43f3d --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_70.txt @@ -0,0 +1 @@ +Sadly, there is no prescription to convert a PyTorch model into TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_71.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..320fce569ebead266006dfd2aac1488b0cc95cdc --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_71.txt @@ -0,0 +1,3 @@ +You can, however, follow our selection of +tips to make the process as smooth as possible: +- Prepend TF to the name of all classes (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_72.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..429544a09708daeb20fa1a2105d7f39d92841732 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_72.txt @@ -0,0 +1 @@ +BrandNewBert becomes TFBrandNewBert). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_73.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b25707e4da6cd403245a2dc5a3a026a6212a27 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_73.txt @@ -0,0 +1 @@ +- Most PyTorch operations have a direct TensorFlow replacement. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_74.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..be36cfaf789b1a6e8110567127a88535e27c97fe --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_74.txt @@ -0,0 +1,2 @@ +For example, torch.nn.Linear corresponds to + tf.keras.layers.Dense, torch.nn.Dropout corresponds to tf.keras.layers.Dropout, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_75.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..d943ab82e12418c8cbf97aa9c18bb30172a9437a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_75.txt @@ -0,0 +1,3 @@ +If you're not sure + about a specific operation, you can use the TensorFlow documentation + or the PyTorch documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_76.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7680f70747d0de34a21bdf233f8a8454205d2d --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_76.txt @@ -0,0 +1 @@ +- Look for patterns in the 🤗 Transformers codebase. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_77.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..b51d69c475ca08505b8ef0a27aa3ff32ff2e1cb7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_77.txt @@ -0,0 +1,2 @@ +If you come across a certain operation that doesn't have a direct + replacement, the odds are that someone else already had the same problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_78.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5b9351e084b7afd79958dd7df1243e514e1d492 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_78.txt @@ -0,0 +1 @@ +- By default, keep the same variable names and structure as in PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_79.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..b86574bc9428b228f2e68fe7ca18232e2c8c9546 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_79.txt @@ -0,0 +1,2 @@ +This will make it easier to debug, track + issues, and add fixes down the line. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_80.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..858de50ccbf44a02f5025d4a17e1148d13d34fdd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_80.txt @@ -0,0 +1 @@ +- Some layers have different default values in each framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_81.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f596546bdd1cc848062eb6d5cb2280011b8dba4 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_81.txt @@ -0,0 +1,3 @@ +A notable example is the batch normalization layer's + epsilon (1e-5 in PyTorch + and 1e-3 in TensorFlow). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_82.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..692bdc68baac22250fe518b9f1d663f8a7abf2a2 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_82.txt @@ -0,0 +1 @@ +Double-check the documentation! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_83.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..75bf9db310e6f4b2ebafe582710e328696c7d65e --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_83.txt @@ -0,0 +1 @@ +- PyTorch's nn.Parameter variables typically need to be initialized within TF Layer's build(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_84.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e0fa3e143ff050bb62c049e5b04987585ba7db3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_84.txt @@ -0,0 +1,5 @@ +See the following + example: PyTorch / + TensorFlow +- If the PyTorch model has a #copied from on top of a function, the odds are that your TensorFlow model can also + borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_85.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..abc557e79fc5b2d61adcee3623c60a10d0b9b531 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_85.txt @@ -0,0 +1,2 @@ +- Assigning the name attribute correctly in TensorFlow functions is critical to do the from_pt=True weight + cross-loading. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_86.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..89774d055bf7b40de31219eaf69fc2e64db05467 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_86.txt @@ -0,0 +1 @@ +name is almost always the name of the corresponding variable in the PyTorch code. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_87.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..32270da4e6231232a5702a9f1f439f609f47e94a --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_87.txt @@ -0,0 +1,2 @@ +If name is not + properly set, you will see it in the error message when loading the model weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_88.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..d711c59de32c29a9a04bab85a9283d7bd3b69fe9 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_88.txt @@ -0,0 +1,2 @@ +- The logic of the base model class, BrandNewBertModel, will actually reside in TFBrandNewBertMainLayer, a Keras + layer subclass (example). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_89.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..00daa07ab9ad808d639a95cc7001912890fd5ca3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_89.txt @@ -0,0 +1 @@ +TFBrandNewBertModel will simply be a wrapper around this layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_90.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..fea6610c1a3c1d162d0fee070f2a89e6ecc4e539 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_90.txt @@ -0,0 +1 @@ +- Keras models need to be built in order to load pretrained weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_91.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..f640b836da1f14834b428604542a85b59f4b2d97 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_91.txt @@ -0,0 +1,3 @@ +For that reason, TFBrandNewBertPreTrainedModel + will need to hold an example of inputs to the model, the dummy_inputs + (example). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_92.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..de274a11b0d883dc7248af0dad7ccf6460061d30 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_92.txt @@ -0,0 +1 @@ +- If you get stuck, ask for help - we're here to help you! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_93.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e39528a38ef713a67b6d5979cbabefd2ade0d35 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_93.txt @@ -0,0 +1,3 @@ +🤗 +In addition to the model file itself, you will also need to add the pointers to the model classes and related +documentation pages. 
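Before moving on to those pointers, the layer-level tips above can be condensed into one illustrative skeleton. This is only a sketch with placeholder names, not a complete port: it shows the direct layer replacements, the name attribute used for from_pt=True weight cross-loading, a weight created in build() (the analogue of a PyTorch nn.Parameter), and the MainLayer-plus-wrapper structure:

import tensorflow as tf
from transformers import TFPreTrainedModel

class TFBrandNewBertIntermediate(tf.keras.layers.Layer):
    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        # torch.nn.Linear -> tf.keras.layers.Dense, torch.nn.Dropout -> tf.keras.layers.Dropout;
        # keep the PyTorch attribute name ("dense") and pass it as the Keras name so that
        # from_pt=True cross-loading can match the variables
        self.dense = tf.keras.layers.Dense(config.intermediate_size, name="dense")
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.intermediate_size = config.intermediate_size

    def build(self, input_shape):
        # The analogue of a PyTorch nn.Parameter is created here, inside build(), not in __init__
        self.scale = self.add_weight(name="scale", shape=(self.intermediate_size,), initializer="ones")
        super().build(input_shape)

    def call(self, hidden_states, training=False):
        # Layers that behave differently at train time (e.g. Dropout) receive the training argument
        return self.dropout(self.dense(hidden_states) * self.scale, training=training)

class TFBrandNewBertMainLayer(tf.keras.layers.Layer):
    """Keras layer holding the actual model logic."""

    def __init__(self, config, **kwargs):
        super().__init__(**kwargs)
        self.embeddings = tf.keras.layers.Embedding(config.vocab_size, config.hidden_size, name="embeddings")
        self.intermediate = TFBrandNewBertIntermediate(config, name="intermediate")

    def call(self, input_ids, training=False):
        return self.intermediate(self.embeddings(input_ids), training=training)

class TFBrandNewBertModel(TFPreTrainedModel):
    """Thin wrapper around the main layer (config_class, dummy_inputs, and output classes omitted here)."""

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.brand_new_bert = TFBrandNewBertMainLayer(config, name="brand_new_bert")

    def call(self, input_ids, training=False):
        return self.brand_new_bert(input_ids, training=training)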
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_94.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ce925560eba54b209b1ffd353e524c52f4934f7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_94.txt @@ -0,0 +1,2 @@ +You can complete this part entirely following the patterns in other PRs +(example). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_95.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..7636e39dc0e4bdd4449318de6944c49d1ff070cd --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_95.txt @@ -0,0 +1,12 @@ +Here's a list of the needed manual +changes: +- Include all public classes of BrandNewBert in src/transformers/__init__.py +- Add BrandNewBert classes to the corresponding Auto classes in src/transformers/models/auto/modeling_tf_auto.py +- Add the lazy loading classes related to BrandNewBert in src/transformers/utils/dummy_tf_objects.py +- Update the import structures for the public classes in src/transformers/models/brand_new_bert/__init__.py +- Add the documentation pointers to the public methods of BrandNewBert in docs/source/en/model_doc/brand_new_bert.md +- Add yourself to the list of contributors to BrandNewBert in docs/source/en/model_doc/brand_new_bert.md +- Finally, add a green tick ✅ to the TensorFlow column of BrandNewBert in docs/source/en/index.md +When you're happy with your implementation, run the following checklist to confirm that your model architecture is +ready: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_96.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..00f64d8ee504b285abf8ce92a4bd2f7b46d1c222 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_96.txt @@ -0,0 +1 @@ +All layers that behave differently at train time (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_97.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..20972499389086bf2e3a4ee1c136acfc4d3764a7 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_97.txt @@ -0,0 +1,3 @@ +Dropout) are called with a training argument, which is +propagated all the way from the top-level classes +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_98.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..5044c6f9f51923c6f64a9720d05054be31a9fa79 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_98.txt @@ -0,0 +1,2 @@ +You have used #copied from whenever possible +3. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_add_tensorflow_model/chunk_99.txt b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dcfd78b6063ed2137a896729e8e06bfa57de6e3 --- /dev/null +++ b/chunked/content_aware_chunking/_add_tensorflow_model/chunk_99.txt @@ -0,0 +1,2 @@ +TFBrandNewBertMainLayer and all classes that use it have their call function decorated with @unpack_inputs +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_10.txt b/chunked/content_aware_chunking/_attention/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad907bafe20835ee3a61aa5b296dc993745a0e93 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_10.txt @@ -0,0 +1 @@ +is enough to take action for a given token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_11.txt b/chunked/content_aware_chunking/_attention/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..99693c67e975266b83f95e825830e6dd5b2fe569 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_11.txt @@ -0,0 +1,3 @@ +Also, by stacking attention layers that have a small +window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a +representation of the whole sentence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_12.txt b/chunked/content_aware_chunking/_attention/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4acffa1a68406cf55b9498bcecc9751b6df1b248 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_12.txt @@ -0,0 +1,3 @@ +Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access +all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in +their local window). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_13.txt b/chunked/content_aware_chunking/_attention/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..857d8149703dbbd57cab0bff93619f7cda5b8285 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_13.txt @@ -0,0 +1,4 @@ +This is shown in Figure 2d of the paper, see below for a sample attention mask: + +Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence +length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_14.txt b/chunked/content_aware_chunking/_attention/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a45824410f1f6d30e0c89008aba8e67f044f085c --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_14.txt @@ -0,0 +1,5 @@ +Other tricks +Axial positional encodings +Reformer uses axial positional encodings: in traditional transformer models, the positional encoding +E is a matrix of size \(l\) by \(d\), \(l\) being the sequence length and \(d\) the dimension of the +hidden state. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_15.txt b/chunked/content_aware_chunking/_attention/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b9647f01226986ba87c4f4c98bf8d8cd07ca007 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_15.txt @@ -0,0 +1 @@ +If you have very long texts, this matrix can be huge and take way too much space on the GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_16.txt b/chunked/content_aware_chunking/_attention/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8ac4c60c99b81a16696add15573fda603b3cd01 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_16.txt @@ -0,0 +1,4 @@ +To alleviate +that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with +dimensions \(l_{1} \times d_{1}\) and \(l_{2} \times d_{2}\), such that \(l_{1} \times l_{2} = l\) and +\(d_{1} + d_{2} = d\) (with the product for the lengths, this ends up being way smaller). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_17.txt b/chunked/content_aware_chunking/_attention/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa542befdad14c5341d2223093127e6e000ca35 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_17.txt @@ -0,0 +1,3 @@ +The embedding for time +step \(j\) in E is obtained by concatenating the embeddings for timestep \(j \% l1\) in E1 and \(j // l1\) +in E2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_7.txt b/chunked/content_aware_chunking/_attention/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1604f7ae8a068d245bebbcf74deaca0c8afa048 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_7.txt @@ -0,0 +1,3 @@ +The attention mask is +modified to mask the current token (except at the first position), because it will give a query and a key equal (so +very similar to each other). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_8.txt b/chunked/content_aware_chunking/_attention/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..441d7ec1b84de8cd660bee75ac934afa0fb41f00 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_8.txt @@ -0,0 +1,2 @@ +Since the hash can be a bit random, several hash functions are used in practice +(determined by a n_rounds parameter) and then are averaged together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_attention/chunk_9.txt b/chunked/content_aware_chunking/_attention/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bb75c6010096060aecdca467b46e22ddface1e9 --- /dev/null +++ b/chunked/content_aware_chunking/_attention/chunk_9.txt @@ -0,0 +1,3 @@ +Local attention +Longformer uses local attention: often, the local context (e.g., what are the two tokens to the +left and right?) 
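To make the axial factorization described above concrete, here is a small numpy sketch (the sizes are arbitrary examples, not values from any real model):

import numpy as np

# Factorize the positional encoding matrix E (l x d) into E1 (l1 x d1) and E2 (l2 x d2),
# with l = l1 * l2 and d = d1 + d2
l1, l2, d1, d2 = 128, 64, 256, 256        # so l = 8192 and d = 512
E1 = np.random.randn(l1, d1)
E2 = np.random.randn(l2, d2)

def axial_position_embedding(j):
    # The embedding for time step j concatenates row (j % l1) of E1 and row (j // l1) of E2
    return np.concatenate([E1[j % l1], E2[j // l1]])

print(axial_position_embedding(5000).shape)   # (512,)
# Stored parameters: l1*d1 + l2*d2 = 49_152 instead of l*d = 4_194_304 for the full matrix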
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_13.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f43cf502f5fbca13d1ec99b89a83803339916ed2 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_13.txt @@ -0,0 +1 @@ +A tokenizer converts your input into a format that can be processed by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_14.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0961b799f1641532db601990962543127eb5367d --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_14.txt @@ -0,0 +1,8 @@ +Load a tokenizer with [AutoTokenizer.from_pretrained]: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + +Then tokenize your input as shown below: + +sequence = "In a hole in the ground there lived a hobbit." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_15.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c61a648b29e77d369c8e056b01c79d951b4e08bf --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_15.txt @@ -0,0 +1,7 @@ +print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} + +AutoImageProcessor +For vision tasks, an image processor processes the image into the correct input format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_16.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c8d054bf6cf44dd1f17284682dbed3f6edf0418 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_16.txt @@ -0,0 +1,6 @@ +from transformers import AutoImageProcessor +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") + +AutoBackbone + +A Swin backbone with multiple stages for outputting a feature map. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_17.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9403e2af42d127f3a6b639b7d16d460dec5bc57 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_17.txt @@ -0,0 +1 @@ +The [AutoBackbone] lets you use pretrained models as backbones to get feature maps from different stages of the backbone. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_18.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4d860420efc1c51baa993b4104739d20fbd6d4b --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_18.txt @@ -0,0 +1,6 @@ +You should specify one of the following parameters in [~PretrainedConfig.from_pretrained]: + +out_indices is the index of the layer you'd like to get the feature map from +out_features is the name of the layer you'd like to get the feature map from + +These parameters can be used interchangeably, but if you use both, make sure they're aligned with each other! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_19.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe14065b91b53fd4ffd2a4151da9437f5f3c69fe --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_19.txt @@ -0,0 +1 @@ +If you don't pass any of these parameters, the backbone returns the feature map from the last layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_20.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..83fd4403602296f0145b0ac99a262fcedb602ce3 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_20.txt @@ -0,0 +1 @@ +A feature map from the first stage of the backbone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_21.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..64a8b9d28bb2ef1d4882b64b9c5b942785ed801d --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_21.txt @@ -0,0 +1 @@ +The patch partition refers to the model stem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_22.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1ab6209a26c53f0e5367e47c378e7ed18795a83 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_22.txt @@ -0,0 +1,21 @@ +For example, in the above diagram, to return the feature map from the first stage of the Swin backbone, you can set out_indices=(1,): + +from transformers import AutoImageProcessor, AutoBackbone +import torch +from PIL import Image +import requests +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") +model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) +inputs = processor(image, return_tensors="pt") +outputs = model(**inputs) +feature_maps = outputs.feature_maps + +Now you can access the feature_maps object from the first stage of the backbone: + +list(feature_maps[0].shape) +[1, 96, 56, 56] + +AutoFeatureExtractor +For audio tasks, a feature extractor processes the audio signal the correct input format. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_23.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..24186810a73f24c541f6153d611a7799e594886d --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_23.txt @@ -0,0 +1,9 @@ +Load a feature extractor with [AutoFeatureExtractor.from_pretrained]: + +from transformers import AutoFeatureExtractor +feature_extractor = AutoFeatureExtractor.from_pretrained( + "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" + ) + +AutoProcessor +Multimodal tasks require a processor that combines two types of preprocessing tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_24.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..98881320095453132a81110693653f23a5b56280 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_24.txt @@ -0,0 +1 @@ +For example, the LayoutLMV2 model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_25.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f22fe7178f5505a73549c427a03feacd3701fa9d --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_25.txt @@ -0,0 +1,8 @@ +Load a processor with [AutoProcessor.from_pretrained]: + +from transformers import AutoProcessor +processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") + +AutoModel + +The AutoModelFor classes let you load a pretrained model for a given task (see here for a complete list of available tasks). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_26.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f19848c1a29b4cdee1dc3f35e1aada85956468 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_26.txt @@ -0,0 +1,11 @@ +For example, load a model for sequence classification with [AutoModelForSequenceClassification.from_pretrained]: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse the same checkpoint to load an architecture for a different task: + +from transformers import AutoModelForTokenClassification +model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") + +For PyTorch models, the from_pretrained() method uses torch.load() which internally uses pickle and is known to be insecure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_27.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d238751152d91f47dac151dac30f570f8049a391 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_27.txt @@ -0,0 +1 @@ +In general, never load a model that could have come from an untrusted source, or that could have been tampered with. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_28.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3aeffa7647b546d20114cb6bd9389dcf240450a --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_28.txt @@ -0,0 +1 @@ +This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are scanned for malware at each commit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_29.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b621b870ee24027af707349b1b84f98ac5df918e --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_29.txt @@ -0,0 +1 @@ +See the Hub documentation for best practices like signed commit verification with GPG. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_30.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..d627d595f168c06f33a8e6b933744f277074d1c2 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_30.txt @@ -0,0 +1 @@ +TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the from_tf and from_flax kwargs for the from_pretrained method to circumvent this issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_31.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..af09b10999348dada07ce540eaf0970f398efcc5 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_31.txt @@ -0,0 +1 @@ +Generally, we recommend using the AutoTokenizer class and the AutoModelFor class to load pretrained instances of models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_32.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..828720267f6af0e92162369230eb8e6ae51285a6 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_32.txt @@ -0,0 +1 @@ +This will ensure you load the correct architecture every time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_33.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1356b63f0523b8a7152097df37caf3717af89282 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_33.txt @@ -0,0 +1 @@ +In the next tutorial, learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. 
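Returning to the note above on TensorFlow and Flax checkpoints, here is a minimal sketch of cross-loading TensorFlow weights into the PyTorch architecture (it assumes the repository provides a TensorFlow checkpoint, which the example checkpoint used above does):

from transformers import AutoModelForSequenceClassification

# from_tf=True converts the TensorFlow weights into the PyTorch architecture at load time
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", from_tf=True
)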
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_34.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d05e5f94e7b75d50f1922ffffb8ca4e3b58ed8a --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_34.txt @@ -0,0 +1 @@ +Finally, the TFAutoModelFor classes let you load a pretrained model for a given task (see here for a complete list of available tasks). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_35.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f3a1ad0b2f9502e0a1c3a75e569d88f7642221 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_35.txt @@ -0,0 +1,11 @@ +For example, load a model for sequence classification with [TFAutoModelForSequenceClassification.from_pretrained]: + +from transformers import TFAutoModelForSequenceClassification +model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse the same checkpoint to load an architecture for a different task: + +from transformers import TFAutoModelForTokenClassification +model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Generally, we recommend using the AutoTokenizer class and the TFAutoModelFor class to load pretrained instances of models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_36.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..828720267f6af0e92162369230eb8e6ae51285a6 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_36.txt @@ -0,0 +1 @@ +This will ensure you load the correct architecture every time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_autoclass_tutorial/chunk_37.txt b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1356b63f0523b8a7152097df37caf3717af89282 --- /dev/null +++ b/chunked/content_aware_chunking/_autoclass_tutorial/chunk_37.txt @@ -0,0 +1 @@ +In the next tutorial, learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. 
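As a quick end-to-end check of the TensorFlow classes covered above, the tokenizer and model can be combined as follows (a minimal sketch reusing the same example checkpoint):

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased")

inputs = tokenizer("In a hole in the ground there lived a hobbit.", return_tensors="tf")
logits = model(**inputs).logits
print(logits.shape)   # (batch_size, num_labels)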
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_15.txt b/chunked/content_aware_chunking/_benchmarks/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6624891a990c4cdd42997cc37b111e2ffecdbe65 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_15.txt @@ -0,0 +1,45 @@ +results = benchmark.run() +print(results) +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: PyTorch +use_torchscript: False +framework_version: 1.4.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 08:58:43.371351 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help + +An instantiated benchmark object can then simply be run by calling benchmark.run(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_16.txt b/chunked/content_aware_chunking/_benchmarks/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a025746209c543024ac41f2ac8d130ef3c9546f --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_16.txt @@ -0,0 +1,44 @@ +results = benchmark.run() +print(results) +results = benchmark.run() +print(results) +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: Tensorflow +use_xla: False +framework_version: 2.2.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:26:35.617317 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +By default, the time and the required memory for inference are benchmarked. 
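The results above come from a benchmark object like the following (a sketch of the typical instantiation; the TensorFlow variant only differs in the class names):

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()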
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_17.txt b/chunked/content_aware_chunking/_benchmarks/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d1c74afd87d993853915c9ba9ef1e9fc9021d57 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_17.txt @@ -0,0 +1,2 @@ +In the example output above the first +two sections show the result corresponding to inference time and inference memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_18.txt b/chunked/content_aware_chunking/_benchmarks/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fa90b2da730d6a8b98e912d3c9d63c17cb47573 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_18.txt @@ -0,0 +1,2 @@ +In addition, all relevant +information about the computing environment, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_19.txt b/chunked/content_aware_chunking/_benchmarks/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1ae357509a583442d6a6d27817ef8b145f5ab62 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_19.txt @@ -0,0 +1,2 @@ +the GPU type, the system, the library versions, etc are printed +out in the third section under ENVIRONMENT INFORMATION. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_20.txt b/chunked/content_aware_chunking/_benchmarks/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca40f887676ec7d4963640db1ded9b2ff13d26c1 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_20.txt @@ -0,0 +1,3 @@ +This information can optionally be saved in a .csv file +when adding the argument save_to_csv=True to [PyTorchBenchmarkArguments] and +[TensorFlowBenchmarkArguments] respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_21.txt b/chunked/content_aware_chunking/_benchmarks/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..176d5ac743d279a680d16d6e49bc23c7a9d64e8c --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_21.txt @@ -0,0 +1,2 @@ +In this case, every section is saved in a separate +.csv file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_22.txt b/chunked/content_aware_chunking/_benchmarks/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa31670521080b114c392bf6ce2193a23f721add --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_22.txt @@ -0,0 +1 @@ +The path to each .csv file can optionally be defined via the argument data classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_23.txt b/chunked/content_aware_chunking/_benchmarks/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb1fec35ecf1e619e022bff74e6669c40423e889 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_23.txt @@ -0,0 +1 @@ +Instead of benchmarking pre-trained models via their model identifier, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_24.txt b/chunked/content_aware_chunking/_benchmarks/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3303d7c89b7dbde7b9cf06ff02e08f4f46088fae --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_24.txt @@ -0,0 +1,2 @@ +google-bert/bert-base-uncased, the user can +alternatively benchmark an arbitrary configuration of any available model class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_25.txt b/chunked/content_aware_chunking/_benchmarks/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3ef13ab205ed8ccfd60368bc9ef728f0e663f68 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_25.txt @@ -0,0 +1,2 @@ +In this case, a list of +configurations must be inserted with the benchmark args as follows. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_26.txt b/chunked/content_aware_chunking/_benchmarks/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9c71f5f1176501b41692fbc3b48f80f6dff3b53 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_26.txt @@ -0,0 +1,133 @@ +from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig +args = PyTorchBenchmarkArguments( + models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] + ) +config_base = BertConfig() +config_384_hid = BertConfig(hidden_size=384) +config_6_lay = BertConfig(num_hidden_layers=6) +benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: PyTorch +use_torchscript: False +framework_version: 1.4.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:35:25.143267 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +py + +from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +args = TensorFlowBenchmarkArguments( + models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] + ) +config_base = BertConfig() +config_384_hid = BertConfig(hidden_size=384) +config_6_lay = BertConfig(num_hidden_layers=6) +benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +benchmark.run() +==================== INFERENCE - 
SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: Tensorflow +use_xla: False +framework_version: 2.2.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:38:15.487125 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +Again, inference time and required memory for inference are measured, but this time for customized configurations +of the BertModel class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_27.txt b/chunked/content_aware_chunking/_benchmarks/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca087fd2ee84b11dd01016313f2c5fd0b798a82e --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_27.txt @@ -0,0 +1,2 @@ +This feature can especially be helpful when deciding for which configuration the model +should be trained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_28.txt b/chunked/content_aware_chunking/_benchmarks/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..158e3739f0bffd36dd0d6e0cd51d3abb39082408 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_28.txt @@ -0,0 +1,2 @@ +Benchmark best practices +This section lists a couple of best practices one should be aware of when benchmarking a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_29.txt b/chunked/content_aware_chunking/_benchmarks/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c12848ec134b153883880fcc55f1e5afa3af6a1 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_29.txt @@ -0,0 +1 @@ +Currently, only single device benchmarking is supported. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_30.txt b/chunked/content_aware_chunking/_benchmarks/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..910a3c7a8729ed1f88ef39aff6458e376490e26f --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_30.txt @@ -0,0 +1,3 @@ +When benchmarking on GPU, it is recommended that the user + specifies on which device the code should be run by setting the CUDA_VISIBLE_DEVICES environment variable in the + shell, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_31.txt b/chunked/content_aware_chunking/_benchmarks/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec0e80c1b85972171508a36374f8a8d7abb74f4c --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_31.txt @@ -0,0 +1 @@ +export CUDA_VISIBLE_DEVICES=0 before running the code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_32.txt b/chunked/content_aware_chunking/_benchmarks/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1985650a972923322b9f95ac73bdda9ce5739dab --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_32.txt @@ -0,0 +1 @@ +The option no_multi_processing should only be set to True for testing and debugging. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_33.txt b/chunked/content_aware_chunking/_benchmarks/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..cad300a9231cdd48f6fd9bba8216cb876b6245b6 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_33.txt @@ -0,0 +1,3 @@ +To ensure accurate + memory measurement it is recommended to run each memory benchmark in a separate process by making sure + no_multi_processing is set to True. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_34.txt b/chunked/content_aware_chunking/_benchmarks/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b4559962f69e5acb21864349986b911e0b37fb --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_34.txt @@ -0,0 +1 @@ +One should always state the environment information when sharing the results of a model benchmark. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_35.txt b/chunked/content_aware_chunking/_benchmarks/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad4194b05d120ef8b0625c0f28cefaa19e3bd43a --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_35.txt @@ -0,0 +1,3 @@ +Results can vary + heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very + useful for the community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_36.txt b/chunked/content_aware_chunking/_benchmarks/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3d83ff6d1dd256a7449880caabad6ca3b04acff --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_36.txt @@ -0,0 +1,3 @@ +Sharing your benchmark +Previously all available core models (10 at the time) have been benchmarked for inference time, across many different +settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_37.txt b/chunked/content_aware_chunking/_benchmarks/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..62ad659ab08a20da2f5132d3d45c9aa580abd102 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_37.txt @@ -0,0 +1,2 @@ +All of those tests were +done across CPUs (except for TensorFlow XLA) and GPUs. 
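To make the benchmarking best practices in the chunks above concrete, here is a minimal sketch that pins the run to a single GPU with CUDA_VISIBLE_DEVICES before constructing the benchmark. The checkpoint name and sizes are placeholders chosen for illustration, and the multiprocessing option is only referenced in a comment because its exact flag name may differ across Transformers versions.

python
import os
from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

# Pin the benchmark to GPU 0 before any CUDA context is created, as recommended above.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

args = PyTorchBenchmarkArguments(
    models=["google-bert/bert-base-uncased"],  # placeholder checkpoint
    batch_sizes=[8],
    sequence_lengths=[8, 32, 128, 512],
    # Only disable multiprocessing (the no_multi_processing option mentioned above)
    # for testing and debugging; keeping it enabled gives more accurate memory numbers.
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()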
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_38.txt b/chunked/content_aware_chunking/_benchmarks/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a5e3f33d47dbb5996f7cbbac6f7251ae87a3d63 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_38.txt @@ -0,0 +1,2 @@ +The approach is detailed in the following blogpost and the results are +available here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_39.txt b/chunked/content_aware_chunking/_benchmarks/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..9856375638a9cd3a3fcd225a21d5eda572e48cbb --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_39.txt @@ -0,0 +1,3 @@ +With the new benchmark tools, it is easier than ever to share your benchmark results with the community + +PyTorch Benchmarking Results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_benchmarks/chunk_40.txt b/chunked/content_aware_chunking/_benchmarks/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59e42fe26b7b2e616dd25c265bbb5027e49de98 --- /dev/null +++ b/chunked/content_aware_chunking/_benchmarks/chunk_40.txt @@ -0,0 +1 @@ +TensorFlow Benchmarking Results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_bertology/chunk_2.txt b/chunked/content_aware_chunking/_bertology/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbc0e0a94f2a026a5d52568af4fd9d199246feac --- /dev/null +++ b/chunked/content_aware_chunking/_bertology/chunk_2.txt @@ -0,0 +1,2 @@ +by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +What Does BERT Look At? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_bertology/chunk_3.txt b/chunked/content_aware_chunking/_bertology/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0c66c3d9e32ecac7c0eff0e649d09e4dc1dc66c --- /dev/null +++ b/chunked/content_aware_chunking/_bertology/chunk_3.txt @@ -0,0 +1,12 @@ +An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to +help people access the inner representations, mainly adapted from the great work of Paul Michel +(https://arxiv.org/abs/1905.10650): + +accessing all the hidden-states of BERT/GPT/GPT-2, +accessing all the attention weights for each head of BERT/GPT/GPT-2, +retrieving head output values and gradients to be able to compute head importance scores and prune heads as explained + in https://arxiv.org/abs/1905.10650. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_bertology/chunk_4.txt b/chunked/content_aware_chunking/_bertology/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..f265fbff154f980ea4b6c5d88136d78324bf867a --- /dev/null +++ b/chunked/content_aware_chunking/_bertology/chunk_4.txt @@ -0,0 +1,2 @@ +To help you understand and use these features, we have added a specific example script: bertology.py, which extracts information from and prunes a model pre-trained on +GLUE.
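As a rough illustration of the three bertology hooks listed above (hidden states, attention weights, and head pruning), the sketch below relies on the standard output_hidden_states/output_attentions forward arguments and the prune_heads method; the checkpoint name and the pruned head indices are arbitrary examples, not values taken from bertology.py.

python
import torch
from transformers import AutoTokenizer, BertModel

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = BertModel.from_pretrained("google-bert/bert-base-uncased")

inputs = tokenizer("BERTology studies the inner workings of BERT.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, output_attentions=True)

print(len(outputs.hidden_states))   # embedding output plus one hidden state per layer
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len) for the first layer

# Prune heads 0 and 2 of layer 0 and head 1 of layer 2 (arbitrary choices for illustration).
model.prune_heads({0: [0, 2], 2: [1]})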
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_10.txt b/chunked/content_aware_chunking/_big_models/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7af41af1f564d224baae1d815f382effe49da055 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_10.txt @@ -0,0 +1,2 @@ +Sharded checkpoints +Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_11.txt b/chunked/content_aware_chunking/_big_models/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d88be6e29ccc395d0e924df7de9030f6a3c3e5a --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_11.txt @@ -0,0 +1 @@ +Instead of having one single checkpoint when you do model.save_pretrained(save_dir), you will end up with several partial checkpoints (each of which is of size < 10GB) and an index that maps parameter names to the files they are stored in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_12.txt b/chunked/content_aware_chunking/_big_models/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e53565779206d7ce99a9528429b7ec6e4416ba --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_12.txt @@ -0,0 +1 @@ +You can control the maximum size before sharding with the max_shard_size parameter, so for the sake of an example, we'll use a normal-sized model with a small shard size: let's take a traditional BERT model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_13.txt b/chunked/content_aware_chunking/_big_models/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..279adf22bf4b69262e42d0a3a910ddbacca2afd0 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_13.txt @@ -0,0 +1,20 @@ +from transformers import AutoModel +model = AutoModel.from_pretrained("google-bert/bert-base-cased") + +If you save it using [~PreTrainedModel.save_pretrained], you will get a new folder with two files: the config of the model and its weights: + +import os +import tempfile +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] + +Now let's use a maximum shard size of 200MB: + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] + +On top of the configuration of the model, we see three different weight files, and an index.json file which is our index.
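As a small follow-up to the sharding example above (not part of the original guide), you can list each saved file and its size on disk to confirm that every shard stays under the max_shard_size you chose; this sketch reuses the model loaded above.

python
import os
import tempfile

with tempfile.TemporaryDirectory() as tmp_dir:
    model.save_pretrained(tmp_dir, max_shard_size="200MB")
    for name in sorted(os.listdir(tmp_dir)):
        size_mb = os.path.getsize(os.path.join(tmp_dir, name)) / 1e6
        print(f"{name}: {size_mb:.1f} MB")  # each shard should stay under the 200MB cap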
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_14.txt b/chunked/content_aware_chunking/_big_models/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c7a38e87e6d655fd2026badd3504c8612e4943f --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_14.txt @@ -0,0 +1,7 @@ +A checkpoint like this can be fully reloaded using the [~PreTrainedModel.from_pretrained] method: + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + new_model = AutoModel.from_pretrained(tmp_dir) + +The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_15.txt b/chunked/content_aware_chunking/_big_models/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..956cc5afd62cbfb6cf25a10397d57e2741a16051 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_15.txt @@ -0,0 +1 @@ +Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_16.txt b/chunked/content_aware_chunking/_big_models/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..212b0e52c4b7a78161975b2a0fc040c3426d5412 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_16.txt @@ -0,0 +1,11 @@ +We can load that index like any json and get a dictionary: + +import json +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: + index = json.load(f) +print(index.keys()) +dict_keys(['metadata', 'weight_map']) + +The metadata just consists of the total size of the model for now. 
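Because the index is plain JSON, you can also invert its weight_map entry (seen in the keys printed above) to count how many parameters landed in each shard; this is just an illustrative sketch built on the index dictionary loaded above, not a Transformers API.

python
from collections import defaultdict

shard_contents = defaultdict(list)
for param_name, shard_file in index["weight_map"].items():
    shard_contents[shard_file].append(param_name)

for shard_file, param_names in sorted(shard_contents.items()):
    print(f"{shard_file}: {len(param_names)} parameters")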
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_17.txt b/chunked/content_aware_chunking/_big_models/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be64db07863276b0e842a5e17835db38da37b244 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_17.txt @@ -0,0 +1,21 @@ +We plan to add other information in the future: + +index["metadata"] +{'total_size': 433245184} + +The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model state_dict) to the file it's stored in: + +index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + + +If you want to directly load such a sharded checkpoint inside a model without using [~PreTrainedModel.from_pretrained] (like you would do model.load_state_dict() for a full checkpoint) you should use [~modeling_utils.load_sharded_checkpoint]: + +from transformers.modeling_utils import load_sharded_checkpoint +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + load_sharded_checkpoint(model, tmp_dir) + +Low memory loading +Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_18.txt b/chunked/content_aware_chunking/_big_models/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed6342a302a74de371570cd62848722596549f9f --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_18.txt @@ -0,0 +1 @@ +Please read the following guide for more information: Large model loading using Accelerate \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_8.txt b/chunked/content_aware_chunking/_big_models/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f8e6692e212b7973346f4c6541f2c83aeecf085 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_8.txt @@ -0,0 +1 @@ +In this guide, we explore the solutions Transformers offer to deal with this issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_big_models/chunk_9.txt b/chunked/content_aware_chunking/_big_models/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89bc83385ba6dd7256cce269f67903f85601bb6 --- /dev/null +++ b/chunked/content_aware_chunking/_big_models/chunk_9.txt @@ -0,0 +1 @@ +Note that this is an area of active development, so the APIs explained here may change slightly in the future. 
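For the low-memory loading mentioned just above, the Accelerate-backed options are exposed directly through from_pretrained; here is a minimal sketch under the assumption that accelerate is installed (the checkpoint name is only a placeholder).

python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",   # placeholder checkpoint
    low_cpu_mem_usage=True,    # avoid materializing a full random-weight copy before loading
    device_map="auto",         # let Accelerate place the weights across available devices
)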
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_100.txt b/chunked/content_aware_chunking/_chat_templating/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17d6078bb377ee2274a3f32ed4c4e31ed81a683 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_100.txt @@ -0,0 +1,11 @@ +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<<SYS>>\\n' + message['content'] + '\\n<</SYS>>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + message['content'] + ' ' + eos_token }} + {% endif %} +{% endfor %} +Hopefully, if you stare at this for a little bit, you can see what this template is doing - it adds specific tokens based +on the "role" of each message, which represents who sent it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_101.txt b/chunked/content_aware_chunking/_chat_templating/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb8c1b207f23868f6a82d66590836fc00c7d02c4 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_101.txt @@ -0,0 +1,2 @@ +User, assistant and system messages are clearly +distinguishable to the model because of the tokens they're wrapped in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_102.txt b/chunked/content_aware_chunking/_chat_templating/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..cac69a0288165859e207e8a0416bb8a14a2d9ea4 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_102.txt @@ -0,0 +1,2 @@ +Advanced: Adding and editing chat templates +How do I create a chat template? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_103.txt b/chunked/content_aware_chunking/_chat_templating/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6005c7e27ad9dc5a4fb9350c8a22204b2cad8e4 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_103.txt @@ -0,0 +1 @@ +Simple, just write a Jinja template and set tokenizer.chat_template. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_104.txt b/chunked/content_aware_chunking/_chat_templating/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..032f8ca5962b6d6eb69f4eaddcbb7ef16f68b247 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_104.txt @@ -0,0 +1,2 @@ +You may find it easier to start with an +existing template from another model and simply edit it for your needs!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_105.txt b/chunked/content_aware_chunking/_chat_templating/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aeb8d67a4b03443b82fc0f31d9a2c09552bdcf0 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_105.txt @@ -0,0 +1,12 @@ +For example, we could take the LLaMA template +above and add "[ASST]" and "[/ASST]" to assistant messages: +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {% endif %} +{% endfor %} +Now, simply set the tokenizer.chat_template attribute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_106.txt b/chunked/content_aware_chunking/_chat_templating/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..dee8ca1c8ab1e3ff893906e237baec4d69309488 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_106.txt @@ -0,0 +1,2 @@ +Next time you use [~PreTrainedTokenizer.apply_chat_template], it will +use your new template! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_107.txt b/chunked/content_aware_chunking/_chat_templating/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..03ddda7b863b5416d7fac12e1d1ba11433d70674 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_107.txt @@ -0,0 +1,3 @@ +This attribute will be saved in the tokenizer_config.json file, so you can use +[~utils.PushToHubMixin.push_to_hub] to upload your new template to the Hub and make sure everyone's using the right +template for your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_108.txt b/chunked/content_aware_chunking/_chat_templating/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9232633e2c385f5ee16e093b041222d93d22525 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_108.txt @@ -0,0 +1,5 @@ +python +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # Change the system token +tokenizer.chat_template = template # Set the new template +tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_109.txt b/chunked/content_aware_chunking/_chat_templating/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..b561543a212234afd5bf03b783d4dfa42412c30e --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_109.txt @@ -0,0 +1,2 @@ +The method [~PreTrainedTokenizer.apply_chat_template] which uses your chat template is called by the [TextGenerationPipeline] class, so +once you set the correct chat template, your model will automatically become compatible with [TextGenerationPipeline].
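One way to follow the advice above about starting from an existing template is to copy the chat_template string from a tokenizer that already has one, tweak it, and assign it to your own tokenizer; the target checkpoint name and the edit below are hypothetical examples.

python
from transformers import AutoTokenizer

donor = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")  # example model with a chat template
my_tokenizer = AutoTokenizer.from_pretrained("my-org/my-chat-model")   # hypothetical target model

template = donor.chat_template                        # the template is just a Jinja string
template = template.replace("<|user|>", "<|human|>")  # hypothetical edit for illustration
my_tokenizer.chat_template = template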
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_110.txt b/chunked/content_aware_chunking/_chat_templating/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..74c6d68646a7188b7307b8dc47a471327e08e38d --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_110.txt @@ -0,0 +1,2 @@ +If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat +control tokens as special tokens in the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_111.txt b/chunked/content_aware_chunking/_chat_templating/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eee7b3d76496cca5142243aede4cab428cef353 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_111.txt @@ -0,0 +1,2 @@ +Special tokens are never split, +ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_112.txt b/chunked/content_aware_chunking/_chat_templating/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d7d2c09217f859275f6ce3ccbe44793e79ae889 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_112.txt @@ -0,0 +1,3 @@ +You +should also set the tokenizer's eos_token attribute to the token that marks the end of assistant generations in your +template. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_113.txt b/chunked/content_aware_chunking/_chat_templating/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2e950b8fbfc98183e1d03eced8f9b0525dc5e2 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_113.txt @@ -0,0 +1 @@ +This will ensure that text generation tools can correctly figure out when to stop generating text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_114.txt b/chunked/content_aware_chunking/_chat_templating/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c34d80eef4b1cf034581888e430cede43ef6138 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_114.txt @@ -0,0 +1 @@ +What are "default" templates? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_115.txt b/chunked/content_aware_chunking/_chat_templating/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..27ae01075ada6c8d93e7eb3ad3efd490f6a98a59 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_115.txt @@ -0,0 +1 @@ +Before the introduction of chat templates, chat handling was hardcoded at the model class level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_116.txt b/chunked/content_aware_chunking/_chat_templating/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd086625513d3019538c19159ccf315e18086aec --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_116.txt @@ -0,0 +1,2 @@ +For backwards +compatibility, we have retained this class-specific handling as default templates, also set at the class level. 
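Returning briefly to the control-token advice a few chunks above (before the default-template discussion continues), a rough sketch of registering ChatML-style control tokens and an end-of-turn token might look like this; the checkpoint name is hypothetical, and the token strings follow the ChatML example later in this document.

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("my-org/my-chat-model")  # hypothetical checkpoint

# Register the chat control tokens so they are never split into pieces during tokenization.
tokenizer.add_special_tokens({"additional_special_tokens": ["<|im_start|>", "<|im_end|>"]})

# Mark the end of assistant turns so text generation tools know when to stop.
tokenizer.eos_token = "<|im_end|>"

# If new tokens were added, remember to resize the model embeddings afterwards, e.g.:
# model.resize_token_embeddings(len(tokenizer))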
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_117.txt b/chunked/content_aware_chunking/_chat_templating/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4457c994f7487897a00a3eb495d54018da83d4c --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_117.txt @@ -0,0 +1,3 @@ +If a +model does not have a chat template set, but there is a default template for its model class, the TextGenerationPipeline +class and methods like apply_chat_template will use the class template instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_118.txt b/chunked/content_aware_chunking/_chat_templating/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..a84c318c4d2b1d7f223a1c240058fd7eda7527ba --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_118.txt @@ -0,0 +1,2 @@ +You can find out what the default +template for your tokenizer is by checking the tokenizer.default_chat_template attribute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_119.txt b/chunked/content_aware_chunking/_chat_templating/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1cd01a58479c108a9b75df908d94fb6b8079667 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_119.txt @@ -0,0 +1 @@ +This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_120.txt b/chunked/content_aware_chunking/_chat_templating/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..778dd8f0b1add0bd2fcc79fa8ee240d3490a0da3 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_120.txt @@ -0,0 +1,4 @@ +Even when +the class template is appropriate for your model, we strongly recommend overriding the default template by +setting the chat_template attribute explicitly to make it clear to users that your model has been correctly configured +for chat, and to future-proof in case the default templates are ever altered or deprecated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_121.txt b/chunked/content_aware_chunking/_chat_templating/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..24b2313b9dd899e96fbd0786166f2401f69710fe --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_121.txt @@ -0,0 +1 @@ +What template should I use? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_122.txt b/chunked/content_aware_chunking/_chat_templating/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..65cb7cd8b7e34495752afdfb9713fb99110d3fef --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_122.txt @@ -0,0 +1,3 @@ +When setting the template for a model that's already been trained for chat, you should ensure that the template +exactly matches the message formatting that the model saw during training, or else you will probably experience +performance degradation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_123.txt b/chunked/content_aware_chunking/_chat_templating/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..342755bd82e90c96b18640dac163c138e5d50615 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_123.txt @@ -0,0 +1,2 @@ +This is true even if you're training the model further - you will probably get the best +performance if you keep the chat tokens constant. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_124.txt b/chunked/content_aware_chunking/_chat_templating/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..c98df36959dace2e6959508338e6189d3676ab4f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_124.txt @@ -0,0 +1,2 @@ +This is very analogous to tokenization - you generally get the +best performance for inference or fine-tuning when you precisely match the tokenization used during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_125.txt b/chunked/content_aware_chunking/_chat_templating/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5449710be1b0acc8676953a84c03432fa55d511 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_125.txt @@ -0,0 +1,2 @@ +If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, +you have a lot of freedom to choose an appropriate template! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_126.txt b/chunked/content_aware_chunking/_chat_templating/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..e280fa0981701d58a0a11ad3af8b0e2a838f9af0 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_126.txt @@ -0,0 +1,2 @@ +LLMs are smart enough to learn to handle lots of different +input formats. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_127.txt b/chunked/content_aware_chunking/_chat_templating/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b507dc45dd10f8278bca6ab3ec5d50e6de3605 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_127.txt @@ -0,0 +1,2 @@ +Our default template for models that don't have a class-specific template follows the +ChatML format, and this is a good, flexible choice for many use-cases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_128.txt b/chunked/content_aware_chunking/_chat_templating/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8576391a85f4b0855901f350152cb043ca44680 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_128.txt @@ -0,0 +1,5 @@ +It looks like this: +{% for message in messages %} + {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} +{% endfor %} +If you like this one, here it is in one-liner form, ready to copy into your code. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_129.txt b/chunked/content_aware_chunking/_chat_templating/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..f04b1d6ad5d8cd00f85b174c0c202409df9faf58 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_129.txt @@ -0,0 +1,2 @@ +The one-liner also includes +handy support for generation prompts, but note that it doesn't add BOS or EOS tokens! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_130.txt b/chunked/content_aware_chunking/_chat_templating/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4d34329cc6c2e10ba2df94449a112906bcc337 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_130.txt @@ -0,0 +1,2 @@ +If your model expects those, they won't be added automatically by apply_chat_template - in other words, the +text will be tokenized with add_special_tokens=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_131.txt b/chunked/content_aware_chunking/_chat_templating/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..625773ee95037d97fd3f70a9eac41c3e378b460f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_131.txt @@ -0,0 +1,2 @@ +This is to avoid potential conflicts between the template and +the add_special_tokens logic. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_132.txt b/chunked/content_aware_chunking/_chat_templating/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..71c8a12a7eca727948696566286376e9dd7741e3 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_132.txt @@ -0,0 +1 @@ +If your model expects special tokens, make sure to add them to the template! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_133.txt b/chunked/content_aware_chunking/_chat_templating/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..c210a893f1e2412d36583aaaf9b95e04d1c91910 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_133.txt @@ -0,0 +1,4 @@ +python +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +This template wraps each message in <|im_start|> and <|im_end|> tokens, and simply writes the role as a string, which +allows for flexibility in the roles you train with. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_134.txt b/chunked/content_aware_chunking/_chat_templating/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..b89f99a512226c9158a3cc946d71a90bd1cdd72a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_134.txt @@ -0,0 +1,10 @@ +The output looks like this: +text +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, +particularly if you want your model to operate well with [TextGenerationPipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_135.txt b/chunked/content_aware_chunking/_chat_templating/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2a077912d8405be5d431a66ab2f98115b33def0 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_135.txt @@ -0,0 +1,2 @@ +However, you are not limited +to these roles - templating is extremely flexible, and any string can be a role. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_136.txt b/chunked/content_aware_chunking/_chat_templating/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b248f3df72b2c7078607b45efc0f1b66fdd4ebe --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_136.txt @@ -0,0 +1 @@ +I want to add some chat templates! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_137.txt b/chunked/content_aware_chunking/_chat_templating/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1281123e7dd8841e9d9d1b2b9b9662d6688750a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_137.txt @@ -0,0 +1 @@ +How should I get started? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_138.txt b/chunked/content_aware_chunking/_chat_templating/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a0bdb2ff5e17351d63e5be007b615cee864edb --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_138.txt @@ -0,0 +1,2 @@ +If you have any chat models, you should set their tokenizer.chat_template attribute and test it using +[~PreTrainedTokenizer.apply_chat_template], then push the updated tokenizer to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_139.txt b/chunked/content_aware_chunking/_chat_templating/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6f76afd9c67ee62cb3f82bcc2e04aa20ee597b0 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_139.txt @@ -0,0 +1,3 @@ +This applies even if you're +not the model owner - if you're using a model with an empty chat template, or one that's still using the default class +template, please open a pull request to the model repository so that this attribute can be set properly! 
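Before opening such a pull request or pushing your own tokenizer, it is easy to smoke-test a template exactly as suggested above; the messages and the repository name below are made up for the test.

python
test_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Does my template work?"},
]

# Render without tokenizing so you can eyeball the control tokens and whitespace.
print(tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True))

# Once the output looks right, push the updated tokenizer so everyone gets the template:
# tokenizer.push_to_hub("my-org/my-chat-model")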
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_140.txt b/chunked/content_aware_chunking/_chat_templating/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..070a2404d352791c7502730feb0e76b107b29128 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_140.txt @@ -0,0 +1 @@ +Once the attribute is set, that's it, you're done! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_141.txt b/chunked/content_aware_chunking/_chat_templating/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8ad787f05e2149e6750588abda9ee0559e302fb --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_141.txt @@ -0,0 +1,2 @@ +tokenizer.apply_chat_template will now work correctly for that +model, which means it is also automatically supported in places like TextGenerationPipeline! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_142.txt b/chunked/content_aware_chunking/_chat_templating/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..178a8068f08de2637c30e10a478b6563a602e58b --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_142.txt @@ -0,0 +1,2 @@ +By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of +open-source models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_143.txt b/chunked/content_aware_chunking/_chat_templating/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..aec84b1ee27955d306965f00655eac84ebce27aa --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_143.txt @@ -0,0 +1,2 @@ +Formatting mismatches have been haunting the field and silently harming performance for too long - +it's time to put an end to them! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_144.txt b/chunked/content_aware_chunking/_chat_templating/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d958e93cae7e582f9da664712e441025d48063b --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_144.txt @@ -0,0 +1,3 @@ +Advanced: Template writing tips +If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first +write a short Python script that formats messages the way you want, and then convert that script into a template. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_145.txt b/chunked/content_aware_chunking/_chat_templating/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0a8da35d0e58c32cb2f1215b308e9ddc6bafff6 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_145.txt @@ -0,0 +1 @@ +Remember that the template handler will receive the conversation history as a variable called messages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_146.txt b/chunked/content_aware_chunking/_chat_templating/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b3ce11866fd407b32683e0e2700f23092c0a149 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_146.txt @@ -0,0 +1,2 @@ +Each +message is a dictionary with two keys, role and content. 
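Following the tip above about prototyping in plain Python first, here is a small reference function that formats such a messages list in ChatML style; once a function like this does what you want, each line maps almost one-to-one onto the Jinja constructs described in the next chunks. The function is only a sketch.

python
def format_chat_ml(messages, add_generation_prompt=False):
    """Render a list of {"role", "content"} dicts in ChatML style."""
    text = ""
    for message in messages:
        text += "<|im_start|>" + message["role"] + "\n" + message["content"] + "<|im_end|>" + "\n"
    if add_generation_prompt:
        text += "<|im_start|>assistant\n"
    return text

print(format_chat_ml([{"role": "user", "content": "Hi there!"}], add_generation_prompt=True))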
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_147.txt b/chunked/content_aware_chunking/_chat_templating/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..574e82f9f72f46bd44c2a338ee5d9dad71fd16b7 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_147.txt @@ -0,0 +1,3 @@ +You will be able to access messages in your template +just like you can in Python, which means you can loop over it with {% for message in messages %} or access +individual messages with, for example, {{ messages[0] }}. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_148.txt b/chunked/content_aware_chunking/_chat_templating/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d418e891dd29049e7fb034ccf0a25b35249b6f7 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_148.txt @@ -0,0 +1,7 @@ +You can also use the following tips to convert your code to Jinja: +For loops +For loops in Jinja look like this: +{% for message in messages %} +{{ message['content'] }} +{% endfor %} +Note that whatever's inside the {{ expression block }} will be printed to the output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_149.txt b/chunked/content_aware_chunking/_chat_templating/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe54aef800714b7675be0f2c2fa41659277888ac --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_149.txt @@ -0,0 +1,2 @@ +You can use operators like ++ to combine strings inside expression blocks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_150.txt b/chunked/content_aware_chunking/_chat_templating/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ccae163d04f41db1e4ae398e535396815529959 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_150.txt @@ -0,0 +1,7 @@ +If statements +If statements in Jinja look like this: +{% if message['role'] == 'user' %} +{{ message['content'] }} +{% endif %} +Note how where Python uses whitespace to mark the beginnings and ends of for and if blocks, Jinja requires you +to explicitly end them with {% endfor %} and {% endif %}. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_151.txt b/chunked/content_aware_chunking/_chat_templating/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..0afc4c436909b2a570271270a6e22eaf3ac06abe --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_151.txt @@ -0,0 +1,3 @@ +Special variables +Inside your template, you will have access to the list of messages, but you can also access several other special +variables. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_152.txt b/chunked/content_aware_chunking/_chat_templating/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..9605efc8eb7d6716d9416a7a90f5b831c137ddb1 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_152.txt @@ -0,0 +1,2 @@ +These include special tokens like bos_token and eos_token, as well as the add_generation_prompt +variable that we discussed above. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_153.txt b/chunked/content_aware_chunking/_chat_templating/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..b280f14293d43216604451c069a9f5b31c2cded6 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_153.txt @@ -0,0 +1,3 @@ +You can also use the loop variable to access information about the current loop +iteration, for example using {% if loop.last %} to check if the current message is the last message in the +conversation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_154.txt b/chunked/content_aware_chunking/_chat_templating/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..12924d2af1d87971a4a8cc46f490f2e5fd50f5ac --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_154.txt @@ -0,0 +1,7 @@ +Here's an example that puts these ideas together to add a generation prompt at the end of the +conversation if add_generation_prompt is True: +{% if loop.last and add_generation_prompt %} +{{ bos_token + 'Assistant:\n' }} +{% endif %} +Notes on whitespace +As much as possible, we've tried to get Jinja to ignore whitespace outside of {{ expressions }}. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_155.txt b/chunked/content_aware_chunking/_chat_templating/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f4589320659e1c1a27f762fb8e529c5c1c2af26 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_155.txt @@ -0,0 +1,3 @@ +However, be aware +that Jinja is a general-purpose templating engine, and it may treat whitespace between blocks on the same line +as significant and print it to the output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_156.txt b/chunked/content_aware_chunking/_chat_templating/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd9658ff3653969507aa64a68baeacd3e5245c0e --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_156.txt @@ -0,0 +1,2 @@ +We strongly recommend checking that your template isn't printing extra +spaces where it shouldn't be before you upload it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_46.txt b/chunked/content_aware_chunking/_chat_templating/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6714046aca403d8b35ddb18c6ef2d372b151cdf --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_46.txt @@ -0,0 +1,11 @@ +Let's try the Zephyr example again, but this time using +a pipeline: +thon +from transformers import pipeline +pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many helicopters can a human eat in one sitting? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_47.txt b/chunked/content_aware_chunking/_chat_templating/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b4862b0989c2aa57b54bed5b8dcb018c005a4ab --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_47.txt @@ -0,0 +1,6 @@ +"}, +] +print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # Print the assistant's response + +text +{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_48.txt b/chunked/content_aware_chunking/_chat_templating/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffab1221649abbe24e09b8daaa823a4ed0cf5b52 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_48.txt @@ -0,0 +1 @@ +Helicopters are not food, they are flying machines. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_49.txt b/chunked/content_aware_chunking/_chat_templating/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..06a4f1b11bb9c4b2c399a3a61d67c84d2b19af56 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_49.txt @@ -0,0 +1 @@ +Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_50.txt b/chunked/content_aware_chunking/_chat_templating/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..e602cdae43a86c0499518c6e3d3db62ba82831fb --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_50.txt @@ -0,0 +1 @@ +But helicopters, they be for transportin' and movin' around, not for eatin'. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_51.txt b/chunked/content_aware_chunking/_chat_templating/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..d38bf5ba7a803618ba581ebf465588a48c43315f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_51.txt @@ -0,0 +1 @@ +So, I'd say none, me hearties. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_52.txt b/chunked/content_aware_chunking/_chat_templating/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..8daff91ba4d643033f8733f5069d8906ef908df7 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_52.txt @@ -0,0 +1 @@ +None at all."} \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_53.txt b/chunked/content_aware_chunking/_chat_templating/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f09aa40ba87a00d38adbdabbb968951c3f545bb --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_53.txt @@ -0,0 +1,2 @@ +The pipeline will take care of all the details of tokenization and calling apply_chat_template for you - +once the model has a chat template, all you need to do is initialize the pipeline and pass it the list of messages! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_54.txt b/chunked/content_aware_chunking/_chat_templating/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e3adaa28e76b766b12092523c494e603514a363 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_54.txt @@ -0,0 +1 @@ +What are "generation prompts"? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_55.txt b/chunked/content_aware_chunking/_chat_templating/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..590d460c004d81b30744ea4d2a5d54fb952e2a8a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_55.txt @@ -0,0 +1 @@ +You may have noticed that the apply_chat_template method has an add_generation_prompt argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_56.txt b/chunked/content_aware_chunking/_chat_templating/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..90b8425e63da9911be571ded91ee767875a4e71d --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_56.txt @@ -0,0 +1,2 @@ +This argument tells +the template to add tokens that indicate the start of a bot response. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_57.txt b/chunked/content_aware_chunking/_chat_templating/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..84db913ad5e2d11992154a041091dfe022f52925 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_57.txt @@ -0,0 +1,4 @@ +For example, consider the following chat: +python +messages = [ + {"role": "user", "content": "Hi there! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_58.txt b/chunked/content_aware_chunking/_chat_templating/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6e756aea254b9d47194659e23e25058abd0b01a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_58.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "Nice to meet you! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_59.txt b/chunked/content_aware_chunking/_chat_templating/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..76d1442ca7b02834570732b413eef0af6e820471 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_59.txt @@ -0,0 +1,2 @@ +"}, + {"role": "user", "content": "Can I ask a question?"} \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_60.txt b/chunked/content_aware_chunking/_chat_templating/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..26e74ef3343fbe911753a79de8e76e1f42e4091f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_60.txt @@ -0,0 +1,23 @@ +] +Here's what this will look like without a generation prompt, using the ChatML template we saw in the Zephyr example: +python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +""" +And here's what it looks like with a generation prompt: +python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +<|im_start|>assistant +""" +Note that this time, we've added the tokens that indicate the start of a bot response. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_61.txt b/chunked/content_aware_chunking/_chat_templating/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..5005eb224a77708e27b60cee4d7e03be3b8ca85f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_61.txt @@ -0,0 +1,3 @@ +This ensures that when the model +generates text it will write a bot response instead of doing something unexpected, like continuing the user's +message. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_62.txt b/chunked/content_aware_chunking/_chat_templating/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..961e6a4670bf93ca66baf7c1836572ddb2bd3d8d --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_62.txt @@ -0,0 +1,2 @@ +Remember, chat models are still just language models - they're trained to continue text, and chat is just a +special kind of text to them! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_63.txt b/chunked/content_aware_chunking/_chat_templating/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..500d0ba79609f90e3df5867ec7f39cfbb868471c --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_63.txt @@ -0,0 +1,2 @@ +You need to guide them with appropriate control tokens, so they know what they're +supposed to be doing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_64.txt b/chunked/content_aware_chunking/_chat_templating/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..307a67363fc9176cf9fc6d9cbb68c0d01581b453 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_64.txt @@ -0,0 +1 @@ +Not all models require generation prompts. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_65.txt b/chunked/content_aware_chunking/_chat_templating/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2aa7825c0fd06035d16103a4797c75f4bb89013 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_65.txt @@ -0,0 +1,2 @@ +Some models, like BlenderBot and LLaMA, don't have any +special tokens before bot responses. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_66.txt b/chunked/content_aware_chunking/_chat_templating/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dbcdd6d8d9c50e3d3e8b34cdee00a4e4ab7977b --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_66.txt @@ -0,0 +1 @@ +In these cases, the add_generation_prompt argument will have no effect. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_67.txt b/chunked/content_aware_chunking/_chat_templating/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..da18f3dbe928caefb2580c7be0b4018f52441e1f --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_67.txt @@ -0,0 +1,2 @@ +The exact +effect that add_generation_prompt has will depend on the template being used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_68.txt b/chunked/content_aware_chunking/_chat_templating/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b2082f5680ac8ceb3abdc241497434e787c318a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_68.txt @@ -0,0 +1 @@ +Can I use chat templates in training? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_69.txt b/chunked/content_aware_chunking/_chat_templating/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..399dfe6476443bd5a6d0e98542fc70210e8df718 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_69.txt @@ -0,0 +1 @@ +Yes! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_70.txt b/chunked/content_aware_chunking/_chat_templating/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..fba2a123f3359ff190fc37b92493890acdf89f0b --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_70.txt @@ -0,0 +1 @@ +We recommend that you apply the chat template as a preprocessing step for your dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_71.txt b/chunked/content_aware_chunking/_chat_templating/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..2832939c0278586ee160c5f0331cdad18677b587 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_71.txt @@ -0,0 +1,2 @@ +After this, you +can simply continue like any other language model training task. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_72.txt b/chunked/content_aware_chunking/_chat_templating/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..da9ada9d0ca014e199daebf48e5256300962eb57 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_72.txt @@ -0,0 +1,3 @@ +When training, you should usually set +add_generation_prompt=False, because the added tokens to prompt an assistant response will not be helpful during +training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_73.txt b/chunked/content_aware_chunking/_chat_templating/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..7394e69d0fe7e6059c01fe925de3c5eac8d7cb51 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_73.txt @@ -0,0 +1,7 @@ +Let's see an example: +python +from transformers import AutoTokenizer +from datasets import Dataset +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") +chat1 = [ + {"role": "user", "content": "Which is bigger, the moon or the sun? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_74.txt b/chunked/content_aware_chunking/_chat_templating/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfda4813f3768fb1c74af2013e2550bbf8aba050 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_74.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "The sun."} \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_75.txt b/chunked/content_aware_chunking/_chat_templating/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e83212e5769dc3f8f93082f678271264a8640a --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_75.txt @@ -0,0 +1,3 @@ +] +chat2 = [ + {"role": "user", "content": "Which is bigger, a virus or a bacterium? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_76.txt b/chunked/content_aware_chunking/_chat_templating/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0dcd46f5df10d905f5478bba68a106484e2f5db --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_76.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "A bacterium."} \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_77.txt b/chunked/content_aware_chunking/_chat_templating/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..00ef546f1a9d9e213c13dea189e5a8c36fe4a036 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_77.txt @@ -0,0 +1,7 @@ +] +dataset = Dataset.from_dict({"chat": [chat1, chat2]}) +dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) +print(dataset['formatted_chat'][0]) +And we get: +<|user|> +Which is bigger, the moon or the sun? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_78.txt b/chunked/content_aware_chunking/_chat_templating/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..44ab0ca2367f6b23a01f7bf47b9e4796017cd7ee --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_78.txt @@ -0,0 +1,2 @@ +<|assistant|> +The sun.
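The formatted_chat column can then be tokenized like any other text column. Below is a minimal sketch, not part of the chunked documentation above, assuming the tokenizer and dataset objects from the example; the truncation settings are illustrative assumptions rather than recommendations.

```python
# Hedged sketch: tokenize the formatted_chat column so it can feed a standard
# causal-LM training loop. `tokenizer` and `dataset` come from the example above;
# max_length=512 is an illustrative choice.
tokenized = dataset.map(
    lambda batch: tokenizer(batch["formatted_chat"], truncation=True, max_length=512),
    batched=True,
)
print(tokenized[0].keys())  # now also includes input_ids and attention_mask
```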
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_79.txt b/chunked/content_aware_chunking/_chat_templating/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..578b68ae4bbeeef80895fbf249d3f64b65f7cd3c --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_79.txt @@ -0,0 +1 @@ +From here, just continue training like you would with a standard language modelling task, using the formatted_chat column. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_80.txt b/chunked/content_aware_chunking/_chat_templating/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..90967aa88caf2b1ca0af64ada72dfdb00f440f85 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_80.txt @@ -0,0 +1 @@ +Advanced: How do chat templates work? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_81.txt b/chunked/content_aware_chunking/_chat_templating/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..2acdab02481345341d06425610256be443c8e038 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_81.txt @@ -0,0 +1 @@ +The chat template for a model is stored on the tokenizer.chat_template attribute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_82.txt b/chunked/content_aware_chunking/_chat_templating/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..644a1df805c9cb58e764c915439c140827c51adc --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_82.txt @@ -0,0 +1,2 @@ +If no chat template is set, the +default template for that model class is used instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_83.txt b/chunked/content_aware_chunking/_chat_templating/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..49f00211de87779c6f2d45b270532d2b33366c81 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_83.txt @@ -0,0 +1,9 @@ +Let's take a look at the template for BlenderBot: +thon + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") +tokenizer.default_chat_template +"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" + +That's kind of intimidating. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_84.txt b/chunked/content_aware_chunking/_chat_templating/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..949a984b2c071856bb508e32f0d572386170d6cf --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_84.txt @@ -0,0 +1 @@ +Let's add some newlines and indentation to make it more readable. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_85.txt b/chunked/content_aware_chunking/_chat_templating/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..3261619af04ce65ac58f294e92593e77b9bec0a0 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_85.txt @@ -0,0 +1,3 @@ +Note that the first +newline after each block as well as any preceding whitespace before a block are ignored by default, using the +Jinja trim_blocks and lstrip_blocks flags. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_86.txt b/chunked/content_aware_chunking/_chat_templating/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa700a9661bec10ca11ab27583b0022fac22f9b --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_86.txt @@ -0,0 +1,2 @@ +However, be cautious - although leading whitespace on each +line is stripped, spaces between blocks on the same line are not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_87.txt b/chunked/content_aware_chunking/_chat_templating/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..772932220adbde5ec0b667345975633ec9db9d13 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_87.txt @@ -0,0 +1,2 @@ +We strongly recommend checking that your template +isn't printing extra spaces where it shouldn't be! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_88.txt b/chunked/content_aware_chunking/_chat_templating/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..9464931e14b864b6ad7268e7c370b467e97af929 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_88.txt @@ -0,0 +1,11 @@ +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ ' ' }} + {% endif %} + {{ message['content'] }} + {% if not loop.last %} + {{ ' ' }} + {% endif %} +{% endfor %} +{{ eos_token }} +If you've never seen one of these before, this is a Jinja template. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_89.txt b/chunked/content_aware_chunking/_chat_templating/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..af887cda9fcb262fcc55349848520efa33fbb1d1 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_89.txt @@ -0,0 +1 @@ +Jinja is a templating language that allows you to write simple code that generates text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_90.txt b/chunked/content_aware_chunking/_chat_templating/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..88c981f712df42cbd12e7bd543a1f90d52a31df3 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_90.txt @@ -0,0 +1,2 @@ +In many ways, the code and +syntax resembles Python. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_91.txt b/chunked/content_aware_chunking/_chat_templating/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..a01b7d07a51234d67987087d4013fb8150ff6d14 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_91.txt @@ -0,0 +1,11 @@ +In pure Python, this template would look something like this: +python +for idx, message in enumerate(messages): + if message['role'] == 'user': + print(' ') + print(message['content']) + if not idx == len(messages) - 1: # Check for the last message in the conversation + print(' ') +print(eos_token) +Effectively, the template does three things: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_92.txt b/chunked/content_aware_chunking/_chat_templating/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd82be2c1666f3cffa9118de995d4499eed71031 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_92.txt @@ -0,0 +1 @@ +For each message, if the message is a user message, add a blank space before it, otherwise print nothing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_93.txt b/chunked/content_aware_chunking/_chat_templating/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_93.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_94.txt b/chunked/content_aware_chunking/_chat_templating/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..6980645b0697e7106d1f30085bb1d6a4ff61e5e3 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_94.txt @@ -0,0 +1,2 @@ +Add the message content +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_95.txt b/chunked/content_aware_chunking/_chat_templating/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4635bf384c72e222015a033c91678d08039e5c2 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_95.txt @@ -0,0 +1 @@ +If the message is not the last message, add two spaces after it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_96.txt b/chunked/content_aware_chunking/_chat_templating/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a35647a927803d6ebe52af1bbf621af1a132806 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_96.txt @@ -0,0 +1 @@ +After the final message, print the EOS token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_97.txt b/chunked/content_aware_chunking/_chat_templating/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecef6ad53f922589b0a096f2fb0a69e8e64d5132 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_97.txt @@ -0,0 +1,2 @@ +This is a pretty simple template - it doesn't add any control tokens, and it doesn't support "system" messages, which +are a common way to give the model directives about how it should behave in the subsequent conversation. 
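For contrast, the ChatML format from the Zephyr example earlier shows what a template with control tokens and system-message support can look like. The sketch below is illustrative only and is not the LLaMA template discussed next; the template string itself is an assumption for demonstration, assigned via the tokenizer.chat_template attribute.

```python
# Illustrative ChatML-style template (an assumption, not taken from the surrounding
# text). It prints every message with role markers, including "system" messages, and
# honors add_generation_prompt by appending the assistant start tokens.
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)
```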
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_98.txt b/chunked/content_aware_chunking/_chat_templating/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ab7778cc2bdbd9f2187e98af984541cbfec019e --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_98.txt @@ -0,0 +1 @@ +But Jinja gives you a lot of flexibility to do those things! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_chat_templating/chunk_99.txt b/chunked/content_aware_chunking/_chat_templating/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b0e71651c1c21800b779439643cbefc21a2b37 --- /dev/null +++ b/chunked/content_aware_chunking/_chat_templating/chunk_99.txt @@ -0,0 +1,3 @@ +Let's see a Jinja template that can format inputs +similarly to the way LLaMA formats them (note that the real LLaMA template includes handling for default system +messages and slightly different system message handling in general - don't use this one in your actual code!) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_community/chunk_5.txt b/chunked/content_aware_chunking/_community/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5db1570ff2f62431fc721d22960f499a07ccc374 --- /dev/null +++ b/chunked/content_aware_chunking/_community/chunk_5.txt @@ -0,0 +1,2 @@ +| Tanmay Thakur | | +|Fine Tune BlenderBotSmall for Summarization using the Trainer API| How to fine-tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_community/chunk_6.txt b/chunked/content_aware_chunking/_community/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca022ed3f970b7c3aad1f1f9a68d33893f8df43e --- /dev/null +++ b/chunked/content_aware_chunking/_community/chunk_6.txt @@ -0,0 +1,7 @@ +| Tanmay Thakur | | +|Fine-tune Electra and interpret with Integrated Gradients | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | Eliza Szczechla | | +|fine-tune a non-English GPT-2 Model with Trainer class | How to fine-tune a non-English GPT-2 Model with Trainer class | Philipp Schmid | | +|Fine-tune a DistilBERT Model for Multi Label Classification task | How to fine-tune a DistilBERT Model for Multi Label Classification task | Dhaval Taunk | | +|Fine-tune ALBERT for sentence-pair classification | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | Nadir El Manouzi | | +|Fine-tune Roberta for sentiment analysis | How to fine-tune a Roberta model for sentiment analysis | Dhaval Taunk | | +|Evaluating Question Generation Models | How accurate are the answers to questions generated by your seq2seq transformer model? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_community/chunk_7.txt b/chunked/content_aware_chunking/_community/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..51af90901471fd8aae1c3931e0b80460d2130edd --- /dev/null +++ b/chunked/content_aware_chunking/_community/chunk_7.txt @@ -0,0 +1,26 @@ +| Pascal Zoleko | | +|Classify text with DistilBERT and Tensorflow | How to fine-tune DistilBERT for text classification in TensorFlow | Peter Bayerle | | +|Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail | How to warm-start a EncoderDecoderModel with a google-bert/bert-base-uncased checkpoint for summarization on CNN/Dailymail | Patrick von Platen | | +|Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum | How to warm-start a shared EncoderDecoderModel with a FacebookAI/roberta-base checkpoint for summarization on BBC/XSum | Patrick von Platen | | +|Fine-tune TAPAS on Sequential Question Answering (SQA) | How to fine-tune TapasForQuestionAnswering with a tapas-base checkpoint on the Sequential Question Answering (SQA) dataset | Niels Rogge | | +|Evaluate TAPAS on Table Fact Checking (TabFact) | How to evaluate a fine-tuned TapasForSequenceClassification with a tapas-base-finetuned-tabfact checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | Niels Rogge | | +|Fine-tuning mBART for translation | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | Vasudev Gupta | | +|Fine-tune LayoutLM on FUNSD (a form understanding dataset) | How to fine-tune LayoutLMForTokenClassification on the FUNSD dataset for information extraction from scanned documents | Niels Rogge | | +|Fine-Tune DistilGPT2 and Generate Text | How to fine-tune DistilGPT2 and generate text | Aakash Tripathi | | +|Fine-Tune LED on up to 8K tokens | How to fine-tune LED on pubmed for long-range summarization | Patrick von Platen | | +|Evaluate LED on Arxiv | How to effectively evaluate LED on long-range summarization | Patrick von Platen | | +|Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset) | How to fine-tune LayoutLMForSequenceClassification on the RVL-CDIP dataset for scanned document classification | Niels Rogge | | +|Wav2Vec2 CTC decoding with GPT2 adjustment | How to decode CTC sequence with language model adjustment | Eric Lam | | +|Fine-tune BART for summarization in two languages with Trainer class | How to fine-tune BART for summarization in two languages with Trainer class | Eliza Szczechla | | +|Evaluate Big Bird on Trivia QA | How to evaluate BigBird on long document question answering on Trivia QA | Patrick von Platen | | +| Create video captions using Wav2Vec2 | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | Niklas Muennighoff | | +| Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | Niels Rogge | | +| Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | Niels Rogge | | +| Evaluate LUKE on Open Entity, an entity typing dataset | How to evaluate LukeForEntityClassification on the Open Entity dataset | Ikuya Yamada | | +| Evaluate LUKE on TACRED, a relation extraction dataset | How to evaluate LukeForEntityPairClassification on the TACRED dataset | Ikuya Yamada 
| | +| Evaluate LUKE on CoNLL-2003, an important NER benchmark | How to evaluate LukeForEntitySpanClassification on the CoNLL-2003 dataset | Ikuya Yamada | | +| Evaluate BigBird-Pegasus on PubMed dataset | How to evaluate BigBirdPegasusForConditionalGeneration on PubMed dataset | Vasudev Gupta | | +| Speech Emotion Classification with Wav2Vec2 | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | Mehrdad Farahani | | +| Detect objects in an image with DETR | How to use a trained DetrForObjectDetection model to detect objects in an image and visualize attention | Niels Rogge | | +| Fine-tune DETR on a custom object detection dataset | How to fine-tune DetrForObjectDetection on a custom object detection dataset | Niels Rogge | | +| Finetune T5 for Named Entity Recognition | How to fine-tune T5 on a Named Entity Recognition Task | Ogundepo Odunayo | | \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_33.txt b/chunked/content_aware_chunking/_create_a_model/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c5d09a58092631a2cbfae8370338f5202e86997 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_33.txt @@ -0,0 +1 @@ +You won't be able to use this model for anything useful yet until you train it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_34.txt b/chunked/content_aware_chunking/_create_a_model/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..696098edf27b9596257466d073b18cd3ba73ae6b --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_34.txt @@ -0,0 +1 @@ +Training is a costly and time-consuming process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_35.txt b/chunked/content_aware_chunking/_create_a_model/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f05b51fde0f72af9ff270c173e9756723aadfea --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_35.txt @@ -0,0 +1 @@ +It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_36.txt b/chunked/content_aware_chunking/_create_a_model/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d8eaafc1a51c13463db009fa0ce705775d472d7 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_36.txt @@ -0,0 +1,5 @@ +Create a pretrained model with [~TFPreTrainedModel.from_pretrained]: + +tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. 
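As a quick illustrative check (assuming the tf_model object created above), the automatically loaded configuration is available on the model itself:

```python
# Inspect the configuration that was loaded together with the pretrained weights.
# `tf_model` is assumed from the example above.
print(tf_model.config)       # the full DistilBertConfig
print(tf_model.config.dim)   # a single attribute, e.g. the hidden size
```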
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_37.txt b/chunked/content_aware_chunking/_create_a_model/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..532a0e80d60a3daa6f66c9875d26062142cc77a6 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_37.txt @@ -0,0 +1,6 @@ +However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) + +Model heads +At this point, you have a base DistilBERT model which outputs the hidden states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_38.txt b/chunked/content_aware_chunking/_create_a_model/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..97fbd185e51262656bcd575096d63ff7264a0bfd --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_38.txt @@ -0,0 +1 @@ +The hidden states are passed as inputs to a model head to produce the final output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_39.txt b/chunked/content_aware_chunking/_create_a_model/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e552f822c670763c295ba6bc6d0d67a7dc7702a --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_39.txt @@ -0,0 +1 @@ +🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_40.txt b/chunked/content_aware_chunking/_create_a_model/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..924f6d3fb0bf53c54fea13911514f0100ca56899 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_40.txt @@ -0,0 +1 @@ +For example, [DistilBertForSequenceClassification] is a base DistilBERT model with a sequence classification head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_41.txt b/chunked/content_aware_chunking/_create_a_model/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4425b98a6068f2577087301ecb29106e523b48 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_41.txt @@ -0,0 +1 @@ +The sequence classification head is a linear layer on top of the pooled outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_42.txt b/chunked/content_aware_chunking/_create_a_model/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b024d8449a52cdce5c5eee12c567226c8e13e2e --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_42.txt @@ -0,0 +1,4 @@ +from transformers import DistilBertForSequenceClassification +model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse this checkpoint for another task by switching to a different model head. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_43.txt b/chunked/content_aware_chunking/_create_a_model/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..30df6d5e9d1d86a31ad276de122d925d6eb27c61 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_43.txt @@ -0,0 +1 @@ +For a question answering task, you would use the [DistilBertForQuestionAnswering] model head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_44.txt b/chunked/content_aware_chunking/_create_a_model/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e50a5ce6496e036e1f3ebe05d69f08290194da9 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_44.txt @@ -0,0 +1 @@ +The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_45.txt b/chunked/content_aware_chunking/_create_a_model/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..96757261b47390e242a5420fd3d769b6435c252c --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_45.txt @@ -0,0 +1,6 @@ +from transformers import DistilBertForQuestionAnswering +model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +`` + + +For example, [TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_46.txt b/chunked/content_aware_chunking/_create_a_model/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4425b98a6068f2577087301ecb29106e523b48 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_46.txt @@ -0,0 +1 @@ +The sequence classification head is a linear layer on top of the pooled outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_47.txt b/chunked/content_aware_chunking/_create_a_model/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ffa2c32bac109440fd747329d086884c8002137 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_47.txt @@ -0,0 +1,4 @@ +from transformers import TFDistilBertForSequenceClassification +tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse this checkpoint for another task by switching to a different model head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_48.txt b/chunked/content_aware_chunking/_create_a_model/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a0d727587d77e9e610a194bf1d33355028e725d --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_48.txt @@ -0,0 +1 @@ +For a question answering task, you would use the [TFDistilBertForQuestionAnswering] model head. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_49.txt b/chunked/content_aware_chunking/_create_a_model/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e50a5ce6496e036e1f3ebe05d69f08290194da9 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_49.txt @@ -0,0 +1 @@ +The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_50.txt b/chunked/content_aware_chunking/_create_a_model/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..face53b4456dc24a3127f88957c799cb0a978719 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_50.txt @@ -0,0 +1,5 @@ +from transformers import TFDistilBertForQuestionAnswering +tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") + +Tokenizer +The last base class you need before using a model for textual data is a tokenizer to convert raw text to tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_51.txt b/chunked/content_aware_chunking/_create_a_model/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..997e570c96188aa06c34146132846309042a2c78 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_51.txt @@ -0,0 +1,3 @@ +There are two types of tokenizers you can use with 🤗 Transformers: + +[PreTrainedTokenizer]: a Python implementation of a tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_52.txt b/chunked/content_aware_chunking/_create_a_model/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..b165734e06485f0556bec0cf2213ee8f9db92901 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_52.txt @@ -0,0 +1 @@ +[PreTrainedTokenizerFast]: a tokenizer from our Rust-based 🤗 Tokenizer library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_53.txt b/chunked/content_aware_chunking/_create_a_model/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..626be49d984bd6823fb61b28e1391b54451a5f7a --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_53.txt @@ -0,0 +1 @@ +This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_54.txt b/chunked/content_aware_chunking/_create_a_model/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad69d706a33953b3b5ec0c4ee08bbccd29d2e387 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_54.txt @@ -0,0 +1 @@ +The fast tokenizer also offers additional methods like offset mapping which maps tokens to their original words or characters. 
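Here is a short sketch of the offset mapping mentioned above; the checkpoint and sentence are illustrative choices, while return_offsets_mapping is a standard option of the fast tokenizers.

```python
from transformers import DistilBertTokenizerFast

# Offset mapping is only available on fast (Rust-backed) tokenizers.
fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
encoding = fast_tokenizer("Hello world!", return_offsets_mapping=True)
print(encoding.tokens())            # subword tokens, including special tokens
print(encoding["offset_mapping"])   # (start, end) character spans for each token
```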
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_55.txt b/chunked/content_aware_chunking/_create_a_model/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ec493ab42bbc79ed157f648002d9f8976f3a664 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_55.txt @@ -0,0 +1 @@ +Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_56.txt b/chunked/content_aware_chunking/_create_a_model/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfb9aa265da79c48cb45872a9f7d8d5e4138eb23 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_56.txt @@ -0,0 +1 @@ +Not every model supports a fast tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_57.txt b/chunked/content_aware_chunking/_create_a_model/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a99804c46ea070fc85143bf0ae0841b340e8b14 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_57.txt @@ -0,0 +1 @@ +Take a look at this table to check if a model has fast tokenizer support. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_58.txt b/chunked/content_aware_chunking/_create_a_model/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e424c67d10e6d3ce8f16d136f48c6932b1f1d26 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_58.txt @@ -0,0 +1,6 @@ +If you trained your own tokenizer, you can create one from your vocabulary file: + +from transformers import DistilBertTokenizer +my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") + +It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_59.txt b/chunked/content_aware_chunking/_create_a_model/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..f418f02b1b72358b666f61296ede1a46471fd37b --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_59.txt @@ -0,0 +1 @@ +You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_60.txt b/chunked/content_aware_chunking/_create_a_model/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b4e21935cf193fbbed46dd33cc84baac026402d --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_60.txt @@ -0,0 +1,11 @@ +Create a tokenizer with a pretrained model's vocabulary with the [DistilBertTokenizer] class: + +from transformers import DistilBertTokenizer +slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + +Create a fast tokenizer with the [DistilBertTokenizerFast] class: + +from transformers import DistilBertTokenizerFast +fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased") + +By default, [AutoTokenizer] will try to load a fast tokenizer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_61.txt b/chunked/content_aware_chunking/_create_a_model/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..e057e29f2f01972dbbbc6dc85b89f57ce7ac27fb --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_61.txt @@ -0,0 +1 @@ +You can disable this behavior by setting use_fast=False in from_pretrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_62.txt b/chunked/content_aware_chunking/_create_a_model/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a83b7a025cde6026fe6e11fed9c3bfe6b891a8f9 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_62.txt @@ -0,0 +1,2 @@ +Image processor +An image processor processes vision inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_63.txt b/chunked/content_aware_chunking/_create_a_model/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..789a47defe40d7ba820faf0136a3cba873d08934 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_63.txt @@ -0,0 +1 @@ +It inherits from the base [~image_processing_utils.ImageProcessingMixin] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_64.txt b/chunked/content_aware_chunking/_create_a_model/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f0405330f8807dcab9c1bcbf19418066a096c15 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_64.txt @@ -0,0 +1 @@ +To use, create an image processor associated with the model you're using. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_65.txt b/chunked/content_aware_chunking/_create_a_model/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..07295fe13e27716b49c12f8dfdfcd95b0cfb79dc --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_65.txt @@ -0,0 +1,24 @@ +For example, create a default [ViTImageProcessor] if you are using ViT for image classification: + +from transformers import ViTImageProcessor +vit_extractor = ViTImageProcessor() +print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} + +If you aren't looking for any customization, just use the from_pretrained method to load a model's default image processor parameters. 
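For instance, a brief sketch of loading those defaults from a checkpoint (the checkpoint name here is an illustrative assumption):

```python
from transformers import ViTImageProcessor

# Load the image processor parameters saved alongside a pretrained ViT checkpoint.
vit_extractor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
```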
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_66.txt b/chunked/content_aware_chunking/_create_a_model/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..5daf6e1080427fb94e560de4a2527f495b63c58f --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_66.txt @@ -0,0 +1,26 @@ +Modify any of the [ViTImageProcessor] parameters to create your custom image processor: + +from transformers import ViTImageProcessor +my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} + +Backbone + +Computer vision models consist of a backbone, neck, and head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_67.txt b/chunked/content_aware_chunking/_create_a_model/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..58eae897b478114b3ecffcabc7520a59339ada44 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_67.txt @@ -0,0 +1 @@ +The backbone extracts features from an input image, the neck combines and enhances the extracted features, and the head is used for the main task (e.g., object detection). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_68.txt b/chunked/content_aware_chunking/_create_a_model/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..efd73604c994cafe1894529a451f4d1d30d78d90 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_68.txt @@ -0,0 +1 @@ +Start by initializing a backbone in the model config and specify whether you want to load pretrained weights or load randomly initialized weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_69.txt b/chunked/content_aware_chunking/_create_a_model/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..50269d0264cd0f2f18db2c731ea05f4c04a56921 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_69.txt @@ -0,0 +1 @@ +Then you can pass the model config to the model head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_70.txt b/chunked/content_aware_chunking/_create_a_model/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f1867242ee720617a6f4868172cee13fbd10e9f --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_70.txt @@ -0,0 +1,3 @@ +For example, to load a ResNet backbone into a MaskFormer model with an instance segmentation head: + +Set use_pretrained_backbone=True to load pretrained ResNet weights for the backbone. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_71.txt b/chunked/content_aware_chunking/_create_a_model/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..a072ca40d62c804dbb9464fe72de5c24ed0c2840 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_71.txt @@ -0,0 +1,5 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True) # backbone and neck config +model = MaskFormerForInstanceSegmentation(config) # head + +You could also load the backbone config separately and then pass it to the model config. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_72.txt b/chunked/content_aware_chunking/_create_a_model/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..45574163f8f4d896a56b3f9e35919598584b8fa0 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_72.txt @@ -0,0 +1,6 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) + +Set use_pretrained_backbone=False to randomly initialize a ResNet backbone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_73.txt b/chunked/content_aware_chunking/_create_a_model/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..34c8314663b409825911fa0e108bbb7765b4ce56 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_73.txt @@ -0,0 +1,5 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=False) # backbone and neck config +model = MaskFormerForInstanceSegmentation(config) # head + +You could also load the backbone config separately and then pass it to the model config. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_74.txt b/chunked/content_aware_chunking/_create_a_model/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3eac9355e114097891b5f1a11b8e322fee0f459 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_74.txt @@ -0,0 +1,6 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +backbone_config = ResNetConfig() +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) + +timm models are loaded with [TimmBackbone] and [TimmBackboneConfig]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_75.txt b/chunked/content_aware_chunking/_create_a_model/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..7557a4572bdb17aab6b132e76aa9fe5f5c26340a --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_75.txt @@ -0,0 +1,7 @@ +python +from transformers import TimmBackboneConfig, TimmBackbone +backbone_config = TimmBackboneConfig("resnet50") +model = TimmBackbone(config=backbone_config) + +Feature extractor +A feature extractor processes audio inputs.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_76.txt b/chunked/content_aware_chunking/_create_a_model/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a4e4522eea3431f74b8a1a1bd7a8eb7ae5cb429 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_76.txt @@ -0,0 +1 @@ +It inherits from the base [~feature_extraction_utils.FeatureExtractionMixin] class, and may also inherit from the [SequenceFeatureExtractor] class for processing audio inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_77.txt b/chunked/content_aware_chunking/_create_a_model/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0db70c70af87960c692e3234e7ee1288e76e04d --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_77.txt @@ -0,0 +1 @@ +To use, create a feature extractor associated with the model you're using. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_78.txt b/chunked/content_aware_chunking/_create_a_model/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..737642505989a51406dc64fd36321970c78a42b3 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_78.txt @@ -0,0 +1,16 @@ +For example, create a default [Wav2Vec2FeatureExtractor] if you are using Wav2Vec2 for audio classification: + +from transformers import Wav2Vec2FeatureExtractor +w2v2_extractor = Wav2Vec2FeatureExtractor() +print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} + +If you aren't looking for any customization, just use the from_pretrained method to load a model's default feature extractor parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_79.txt b/chunked/content_aware_chunking/_create_a_model/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..36430d5013e18f37af7f953c682e1bf8009dce7c --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_79.txt @@ -0,0 +1,17 @@ +Modify any of the [Wav2Vec2FeatureExtractor] parameters to create your custom feature extractor: + +from transformers import Wav2Vec2FeatureExtractor +w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} + +Processor +For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_80.txt b/chunked/content_aware_chunking/_create_a_model/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..328b2b200f3504090808e7af6c780383277d724e --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_80.txt @@ -0,0 +1 @@ +For example, let's use the [Wav2Vec2Processor] for an automatic speech recognition task (ASR). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_81.txt b/chunked/content_aware_chunking/_create_a_model/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..f42148c21a49296898ac1881a1c92781225189cd --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_81.txt @@ -0,0 +1 @@ +ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_82.txt b/chunked/content_aware_chunking/_create_a_model/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..76b4f2ea4427864ba2901a22e7eeeeefe82bb3fb --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_82.txt @@ -0,0 +1,16 @@ +Create a feature extractor to handle the audio inputs: + +from transformers import Wav2Vec2FeatureExtractor +feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) + +Create a tokenizer to handle the text inputs: + +from transformers import Wav2Vec2CTCTokenizer +tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") + +Combine the feature extractor and tokenizer in [Wav2Vec2Processor]: + +from transformers import Wav2Vec2Processor +processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) + +With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_83.txt b/chunked/content_aware_chunking/_create_a_model/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cf69efa0969d879d6f53f128192efef1c441a68 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_83.txt @@ -0,0 +1 @@ +Each of these base classes are configurable, allowing you to use the specific attributes you want. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_create_a_model/chunk_84.txt b/chunked/content_aware_chunking/_create_a_model/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..364b7db36cfd45d9cd27e54248c9693fd99e9473 --- /dev/null +++ b/chunked/content_aware_chunking/_create_a_model/chunk_84.txt @@ -0,0 +1 @@ +You can easily setup a model for training or modify an existing pretrained model to fine-tune. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_24.txt b/chunked/content_aware_chunking/_custom_models/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b07f93b4f634fecf07da6fb4ab8334e23ba854c7 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_24.txt @@ -0,0 +1,3 @@ +We will actually write two: one that +extracts the hidden features from a batch of images (like [BertModel]) and one that is suitable for image +classification (like [BertForSequenceClassification]). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_25.txt b/chunked/content_aware_chunking/_custom_models/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..457d18aab49143ae21b9a24844cc34083d53b3d6 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_25.txt @@ -0,0 +1 @@ +As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_26.txt b/chunked/content_aware_chunking/_custom_models/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..51b823ce764c4df1d67b339cfb102d00fe4c8f16 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_26.txt @@ -0,0 +1,2 @@ +The only +thing we need to do before writing this class is a map between the block types and actual block classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_27.txt b/chunked/content_aware_chunking/_custom_models/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab10b738ac7a2e5741d666ba6ffe333843c782d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_27.txt @@ -0,0 +1,56 @@ +Then the +model is defined from the configuration by passing everything to the ResNet class: + +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig +def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + +def forward(self, tensor): + return self.model.forward_features(tensor) + +For the model that will classify images, we just change the forward method: + +import torch +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig +def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + +def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} + +In both cases, notice how we inherit from PreTrainedModel and call the superclass initialization with the config +(a bit like when you write a regular torch.nn.Module).
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_28.txt b/chunked/content_aware_chunking/_custom_models/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e0909605cf7389d06b581131205d840dae88330 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_28.txt @@ -0,0 +1,2 @@ +The line that sets the config_class is not mandatory, unless +you want to register your model with the auto classes (see last section). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_29.txt b/chunked/content_aware_chunking/_custom_models/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b20cdd3718fd60dfa9d21f5094dc1984b3716887 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_29.txt @@ -0,0 +1 @@ +If your model is very similar to a model inside the library, you can re-use the same configuration as this model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_30.txt b/chunked/content_aware_chunking/_custom_models/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7833b881b0145203b829defacfef1d06b0c5d4d8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_30.txt @@ -0,0 +1,3 @@ +You can have your model return anything you want, but returning a dictionary like we did for +ResnetModelForImageClassification, with the loss included when labels are passed, will make your model directly +usable inside the [Trainer] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_31.txt b/chunked/content_aware_chunking/_custom_models/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2bf94fcf9966f93a60b6b7b24f2b2fb74730da8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_31.txt @@ -0,0 +1,2 @@ +Using another output format is fine as long as you are planning on using your own +training loop or another library for training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_32.txt b/chunked/content_aware_chunking/_custom_models/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb301901bcff31c952758b5deecf0367c25cc283 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_32.txt @@ -0,0 +1,5 @@ +Now that we have our model class, let's create one: +py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +Again, you can use any of the methods of [PreTrainedModel], like [~PreTrainedModel.save_pretrained] or +[~PreTrainedModel.push_to_hub]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_33.txt b/chunked/content_aware_chunking/_custom_models/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..963035269bdab3ec7c8cfd0939d6fe74a4b23d98 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_33.txt @@ -0,0 +1,2 @@ +We will use the second in the next section, and see how to push the model weights +with the code of our model. 
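As a quick illustration of the first of those two methods, a purely local round trip could look like the following sketch (the folder name is arbitrary and not taken from the original guide):

```python
# Hedged sketch: save the custom model locally and load it back with the same class.
resnet50d.save_pretrained("resnet50d-local")   # writes config.json and the model weights
reloaded = ResnetModelForImageClassification.from_pretrained("resnet50d-local")
```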
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_34.txt b/chunked/content_aware_chunking/_custom_models/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cd21e5dcee9afff9ad1615db02a62e9a58cb7f2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_34.txt @@ -0,0 +1 @@ +But first, let's load some pretrained weights inside our model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_35.txt b/chunked/content_aware_chunking/_custom_models/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdbfc219357dc4715f57155066f040c926a0b898 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_35.txt @@ -0,0 +1 @@ +In your own use case, you will probably be training your custom model on your own data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_36.txt b/chunked/content_aware_chunking/_custom_models/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cb2eb2ed8b421904e3d796705bc71a6dc030a0e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_36.txt @@ -0,0 +1,2 @@ +To go fast for this tutorial, +we will use the pretrained version of the resnet50d. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_37.txt b/chunked/content_aware_chunking/_custom_models/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..186c076255dcb122192626a852dd44e4c8db8402 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_37.txt @@ -0,0 +1,9 @@ +Since our model is just a wrapper around it, it's going to be +easy to transfer those weights: + +import timm +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) + +Now let's see how to make sure that when we do [~PreTrainedModel.save_pretrained] or [~PreTrainedModel.push_to_hub], the +code of the model is saved. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_38.txt b/chunked/content_aware_chunking/_custom_models/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..14fceb13fe41f7b9caed0bf6915f1bc019342057 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_38.txt @@ -0,0 +1,3 @@ +Registering a model with custom code to the auto classes +If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_39.txt b/chunked/content_aware_chunking/_custom_models/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc6f2baf40b37d4f6265a680fbf7ffdb962fe39 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_39.txt @@ -0,0 +1,2 @@ +This is different from pushing the code to the Hub in the sense that users will need to import your library to +get the custom models (contrarily to automatically downloading the model code from the Hub). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_40.txt b/chunked/content_aware_chunking/_custom_models/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..0696d7f2f09103780e4178ae89305c5125b1bf56 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_40.txt @@ -0,0 +1,11 @@ +As long as your config has a model_type attribute that is different from existing model types, and that your model +classes have the right config_class attributes, you can just add them to the auto classes like this: + +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) + +Note that the first argument used when registering your custom config to [AutoConfig] needs to match the model_type +of your custom config, and the first argument used when registering your custom models to any auto model class needs +to match the config_class of those models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_41.txt b/chunked/content_aware_chunking/_custom_models/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba57f250eda4571e97662cb99d5f94309d244991 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_41.txt @@ -0,0 +1,3 @@ +Sending the code to the Hub + +This API is experimental and may have some slight breaking changes in the next releases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_42.txt b/chunked/content_aware_chunking/_custom_models/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce40a6b5e23218b8deab6b8bdeff852f2bd72fd2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_42.txt @@ -0,0 +1 @@ +First, make sure your model is fully defined in a .py file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_43.txt b/chunked/content_aware_chunking/_custom_models/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..39de8ca09a502833270896b7fca89d909b093f0e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_43.txt @@ -0,0 +1,2 @@ +It can rely on relative imports to some other files as +long as all the files are in the same directory (we don't support submodules for this feature yet). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_44.txt b/chunked/content_aware_chunking/_custom_models/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..92fa01477bdd674b531610c5788b2661624bcad9 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_44.txt @@ -0,0 +1,3 @@ +For our example, +we'll define a modeling_resnet.py file and a configuration_resnet.py file in a folder of the current working +directory named resnet_model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_45.txt b/chunked/content_aware_chunking/_custom_models/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..007e57cfacc8ab822f13d656251b1221d9b83c6d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_45.txt @@ -0,0 +1,2 @@ +The configuration file contains the code for ResnetConfig and the modeling file +contains the code of ResnetModel and ResnetModelForImageClassification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_46.txt b/chunked/content_aware_chunking/_custom_models/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..945c9b46d684f08ec84cb316e1dc0061e361f794 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_46.txt @@ -0,0 +1 @@ +. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_47.txt b/chunked/content_aware_chunking/_custom_models/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..34ee01ffe2bb80de1cac90f577c974f381c70af8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_47.txt @@ -0,0 +1,5 @@ +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +The __init__.py can be empty; it's just there so that Python detects that resnet_model can be used as a module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_48.txt b/chunked/content_aware_chunking/_custom_models/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..57dcb6e18bf66b576589b7b1eeb612d21eb8ce54 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_48.txt @@ -0,0 +1,2 @@ +If copying a modeling file from the library, you will need to replace all the relative imports at the top of the file +to import from the transformers package. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_49.txt b/chunked/content_aware_chunking/_custom_models/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b6b528398412aa207368e51d601e84a2115675f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_49.txt @@ -0,0 +1 @@ +Note that you can re-use (or subclass) an existing configuration/model. 
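For example, if your architecture only differs by one extra hyperparameter, one option is to subclass the configuration written earlier instead of rewriting it; the class name and the dropout field below are purely hypothetical illustrations:

```python
# Hypothetical sketch: re-using an existing configuration by subclassing it.
from resnet_model.configuration_resnet import ResnetConfig

class ResnetWithDropoutConfig(ResnetConfig):
    model_type = "resnet-with-dropout"  # must differ from existing model types

    def __init__(self, dropout: float = 0.1, **kwargs):
        self.dropout = dropout          # the only new attribute in this sketch
        super().__init__(**kwargs)
```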
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_50.txt b/chunked/content_aware_chunking/_custom_models/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..c68b29a2447b14ec6938fbbec335d5d2df3a48b2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_50.txt @@ -0,0 +1,13 @@ +To share your model with the community, follow those steps: first import the ResNet model and config from the newly +created files: +py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +Then you have to tell the library you want to copy the code files of those objects when using the save_pretrained +method and properly register them with a given Auto class (especially for models), just run: +py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, +[AutoConfig]) but it's different for models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_51.txt b/chunked/content_aware_chunking/_custom_models/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0eacce0e500f41d1dfaf752c9672f21ea287f03 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_51.txt @@ -0,0 +1,2 @@ +Your custom model could be suitable for many different tasks, so you +have to specify which one of the auto classes is the correct one for your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_52.txt b/chunked/content_aware_chunking/_custom_models/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..36e30b2bbbd20765ea2ca934bd5ebe3917807263 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_52.txt @@ -0,0 +1 @@ +Use register_for_auto_class() if you want the code files to be copied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_53.txt b/chunked/content_aware_chunking/_custom_models/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f2badedda0c460ad2e193dbb49f537cec58ce6 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_53.txt @@ -0,0 +1,2 @@ +If you instead prefer to use code on the Hub from another repo, +you don't need to call it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_54.txt b/chunked/content_aware_chunking/_custom_models/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..14f703570299acd4ac390bff49164e43403a62d6 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_54.txt @@ -0,0 +1,17 @@ +In cases where there's more than one auto class, you can modify the config.json directly using the +following structure: +json +"auto_map": { + "AutoConfig": "--", + "AutoModel": "--", + "AutoModelFor": "--", +}, + +Next, let's create the config and models as we did before: + +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) + +Now to send the model to the Hub, make sure you are logged in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_55.txt b/chunked/content_aware_chunking/_custom_models/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f82fc911849b1b3aa24a68d2a656fe24293d8bc --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_55.txt @@ -0,0 +1,13 @@ +Either run in your terminal: + +huggingface-cli login +or from a notebook: + +from huggingface_hub import notebook_login +notebook_login() + +You can then push to your own namespace (or an organization you are a member of) like this: +py +resnet50d.push_to_hub("custom-resnet50d") +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration .py files in the folder custom-resnet50d and uploaded the result to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_56.txt b/chunked/content_aware_chunking/_custom_models/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bf7563c2086bad8e6beebd33692326e99da78e1 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_56.txt @@ -0,0 +1,2 @@ +You can check the result +in this model repo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_57.txt b/chunked/content_aware_chunking/_custom_models/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..576aef6eb6dced784bb27480e6ea5fc800e97554 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_57.txt @@ -0,0 +1 @@ +See the sharing tutorial for more information on the push to Hub method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_58.txt b/chunked/content_aware_chunking/_custom_models/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..c865df126bdda61cf553e1fdc6a181f0c5be83b0 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_58.txt @@ -0,0 +1,3 @@ +Using a model with custom code +You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and +the from_pretrained method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_59.txt b/chunked/content_aware_chunking/_custom_models/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0da27216e9cd7b3debe9fd461e48dad75967ec8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_59.txt @@ -0,0 +1,2 @@ +All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still +review the model code and author to avoid executing malicious code on your machine. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_60.txt b/chunked/content_aware_chunking/_custom_models/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceee2c3cf869986ad1edb7618526fe8c722478bd --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_60.txt @@ -0,0 +1,8 @@ +Set trust_remote_code=True to use +a model with custom code: + +from transformers import AutoModelForImageClassification +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) + +It is also strongly encouraged to pass a commit hash as a revision to make sure the author of the models did not +update the code with some malicious new lines (unless you fully trust the authors of the models). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_models/chunk_61.txt b/chunked/content_aware_chunking/_custom_models/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd2ed8d4ff320606a803462caba3b1faa579c114 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_models/chunk_61.txt @@ -0,0 +1,7 @@ +py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit +hash of any commit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_100.txt b/chunked/content_aware_chunking/_custom_tools/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c3d6db429c8df1718ba6b37d3895a7008625116 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_100.txt @@ -0,0 +1 @@ +We can help the agent here by changing the tool name and description of image_transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_101.txt b/chunked/content_aware_chunking/_custom_tools/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81bf0689fa0d7a1e321d31a621c240064fac3bd --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_101.txt @@ -0,0 +1,8 @@ +Let's instead call it modifier +to disassociate it a bit from "image" and "prompt": +py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +Now "modify" is a strong cue to use the new image processor which should help with the above prompt. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_102.txt b/chunked/content_aware_chunking/_custom_tools/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9f146e5d69850e671753cca47b091a62022d0 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_102.txt @@ -0,0 +1 @@ +Let's run it again. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_103.txt b/chunked/content_aware_chunking/_custom_tools/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dc056c2bf3abe29494a280fd63cf5780c6b39f9 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_103.txt @@ -0,0 +1,6 @@ +py +agent.run("Make an image of a house and a car", return_code=True) +Now we're getting: +```text +==Explanation from the agent== +I will use the following tools: image_generator to generate an image of a house, then image_generator to generate an image of a car. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_104.txt b/chunked/content_aware_chunking/_custom_tools/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..784f408dfebfdecffeb85e8e056641769dee14e2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_104.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") + +which is definitely closer to what we had in mind! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_105.txt b/chunked/content_aware_chunking/_custom_tools/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..f131a2088d774566a2db262d70baf6273a0eb3eb --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_105.txt @@ -0,0 +1 @@ +However, we want to have both the house and car in the same image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_106.txt b/chunked/content_aware_chunking/_custom_tools/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8062b9e755fe6f114b483031b2360545f59b75b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_106.txt @@ -0,0 +1,6 @@ +Steering the task more toward single image generation should help: +py +agent.run("Create image: 'A house and car'", return_code=True) +```text +==Explanation from the agent== +I will use the following tool: image_generator to generate an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_107.txt b/chunked/content_aware_chunking/_custom_tools/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..63dce6486ef2371291a50a884e478aa462fb0ee2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_107.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +image = image_generator(prompt="A house and car") + +Agents are still brittle for many use cases, especially when it comes to +slightly more complex use cases like generating an image of multiple objects. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_108.txt b/chunked/content_aware_chunking/_custom_tools/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..23d48d3cfc4db4d24d831421abe9ac3b4cd05112 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_108.txt @@ -0,0 +1,2 @@ +Both the agent itself and the underlying prompt will be further improved in the coming +months making sure that agents become more robust to a variety of user inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_109.txt b/chunked/content_aware_chunking/_custom_tools/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f9399c4a8007c74ac5d1caec3ffc68dbe9f27aa --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_109.txt @@ -0,0 +1,3 @@ +Customizing the whole prompt +To give the user maximum flexibility, the whole prompt template as explained in above +can be overwritten by the user. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_110.txt b/chunked/content_aware_chunking/_custom_tools/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0cf33a8595f48685734b3b9c83f081ea19007e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_110.txt @@ -0,0 +1,2 @@ +In this case make sure that your custom prompt includes an introduction section, +a tool section, an example section, and an unfinished example section. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_111.txt b/chunked/content_aware_chunking/_custom_tools/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa0a8c9e4c99867cfe6f57c59992f70727cb0985 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_111.txt @@ -0,0 +1,8 @@ +If you want to overwrite the run prompt template, +you can do as follows: + +template = """ [] """ +agent = HfAgent(your_endpoint, run_prompt_template=template) + +Please make sure to have the <> string and the <> defined somewhere in the template so that the agent can be aware +of the tools, it has available to it as well as correctly insert the user's prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_112.txt b/chunked/content_aware_chunking/_custom_tools/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b015696944745a5be2c794c91083c2d2bbba471 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_112.txt @@ -0,0 +1 @@ +Similarly, one can overwrite the chat prompt template. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_113.txt b/chunked/content_aware_chunking/_custom_tools/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7b8be6b2447ef1ce2d45bbb14911f4ebced6e31 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_113.txt @@ -0,0 +1,6 @@ +Note that the chat mode always uses the following format for the exchanges: +```text +Human: <> +Assistant: + +Therefore it is important that the examples of the custom chat prompt template also make use of this format. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_114.txt b/chunked/content_aware_chunking/_custom_tools/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f7e47039b4da9bfe59dd1f8ccafb8c157ffaefb --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_114.txt @@ -0,0 +1 @@ +You can overwrite the chat template at instantiation as follows. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_115.txt b/chunked/content_aware_chunking/_custom_tools/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac95de28060b781134710ebb9908b98aab8b153e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_115.txt @@ -0,0 +1,6 @@ +thon +template = """ [] """ +agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template) + +Please make sure to have the <> string defined somewhere in the template so that the agent can be aware +of the tools, it has available to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_116.txt b/chunked/content_aware_chunking/_custom_tools/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd93e541d0b272d56fd3b61c12017194a3537d85 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_116.txt @@ -0,0 +1 @@ +In both cases, you can pass a repo ID instead of the prompt template if you would like to use a template hosted by someone in the community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_117.txt b/chunked/content_aware_chunking/_custom_tools/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c00ebc7bc8a1febe433553e10f1cbfad0e625b8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_117.txt @@ -0,0 +1 @@ +The default prompts live in this repo as an example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_118.txt b/chunked/content_aware_chunking/_custom_tools/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce78dea7b44494ab4bb1f183544992bd41dfa059 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_118.txt @@ -0,0 +1,10 @@ +To upload your custom prompt on a repo on the Hub and share it with the community just make sure: +- to use a dataset repository +- to put the prompt template for the run command in a file named run_prompt_template.txt +- to put the prompt template for the chat command in a file named chat_prompt_template.txt +Using custom tools +In this section, we'll be leveraging two existing custom tools that are specific to image generation: + +We replace huggingface-tools/image-transformation, + with diffusers/controlnet-canny-tool + to allow for more image modifications. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_119.txt b/chunked/content_aware_chunking/_custom_tools/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc7a63fc0c833a94cf4553d5feeaa9335e9959c3 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_119.txt @@ -0,0 +1,2 @@ +We add a new tool for image upscaling to the default toolbox: + diffusers/latent-upscaler-tool replace the existing image-transformation tool. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_120.txt b/chunked/content_aware_chunking/_custom_tools/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8eff38f88c1c01c2c4740c577f0f23bbd6d6f27 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_120.txt @@ -0,0 +1,8 @@ +We'll start by loading the custom tools with the convenient [load_tool] function: + +from transformers import load_tool +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") + +Upon adding custom tools to an agent, the tools' descriptions and names are automatically +included in the agents' prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_121.txt b/chunked/content_aware_chunking/_custom_tools/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..f74b02672c2baab94870a4f1d2cc1d23938ea16d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_121.txt @@ -0,0 +1,2 @@ +Thus, it is imperative that custom tools have +a well-written description and name in order for the agent to understand how to use them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_122.txt b/chunked/content_aware_chunking/_custom_tools/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e49db1b66d4f0aa698222cd6d2cc542f8d4a2cd --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_122.txt @@ -0,0 +1,7 @@ +Let's take a look at the description and name of controlnet_transformer: +py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +gives +text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_123.txt b/chunked/content_aware_chunking/_custom_tools/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..0040646ecccd126cd51780fbf5b3404aa4793b2c --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_123.txt @@ -0,0 +1 @@ +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_124.txt b/chunked/content_aware_chunking/_custom_tools/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..5158cc25255d432898e514acdf2397772be5c62e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_124.txt @@ -0,0 +1 @@ +It returns the modified image.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_125.txt b/chunked/content_aware_chunking/_custom_tools/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66eaab8a7003f66847aa83a00016972a6c7e997 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_125.txt @@ -0,0 +1,2 @@ +Name: 'image_transformer' +The name and description are accurate and fit the style of the curated set of tools. 
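The upscaler loaded earlier can be inspected the same way, since every tool exposes the same two attributes (a small sketch; the exact description text depends on the tool's repository):

```python
# Sketch: inspect the second custom tool just like controlnet_transformer above.
print(f"Description: '{upscaler.description}'")
print(f"Name: '{upscaler.name}'")
```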
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_126.txt b/chunked/content_aware_chunking/_custom_tools/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..11620ca42b33c06cbafeab678120d4e9e3f21c23 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_126.txt @@ -0,0 +1,9 @@ +Next, let's instantiate an agent with controlnet_transformer and upscaler: +py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +This command should give you the following info: +text +image_transformer has been replaced by as provided in `additional_tools` +The set of curated tools already has an image_transformer tool which is hereby replaced with our custom tool. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_127.txt b/chunked/content_aware_chunking/_custom_tools/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..32c1635554b42a557fdd2d2b38875974a6be286c --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_127.txt @@ -0,0 +1,2 @@ +Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool +because the agent is well-versed in using the specific task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_128.txt b/chunked/content_aware_chunking/_custom_tools/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff05d761cd16912fed74122f055014d12993e428 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_128.txt @@ -0,0 +1,3 @@ +Beware that the custom tool should follow the exact same API +as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that +tool are updated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_129.txt b/chunked/content_aware_chunking/_custom_tools/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f5c3eb464ff6abca18a486db6f166b7bf6919a3 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_129.txt @@ -0,0 +1 @@ +The upscaler tool was given the name image_upscaler which is not yet present in the default toolbox and is therefore simply added to the list of tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_130.txt b/chunked/content_aware_chunking/_custom_tools/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..df99f0e4433adf993a2c811d47d32d965f25cfd4 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_130.txt @@ -0,0 +1,20 @@ +You can always have a look at the toolbox that is currently available to the agent via the agent.toolbox attribute: +py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +Note how image_upscaler is now part of the agents' toolbox. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_131.txt b/chunked/content_aware_chunking/_custom_tools/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..c56ff38ad75bcde8185216c62ef635fdd51409f2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_131.txt @@ -0,0 +1 @@ +Let's now try out the new tools! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_132.txt b/chunked/content_aware_chunking/_custom_tools/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a92c1c3540f4662c2f8a18c2df2a940c49298e6 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_132.txt @@ -0,0 +1 @@ +We will re-use the image we generated in Transformers Agents Quickstart. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_133.txt b/chunked/content_aware_chunking/_custom_tools/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..7392bddbcfac83a42d238e771f4b496e9a355c68 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_133.txt @@ -0,0 +1,12 @@ +from diffusers.utils import load_image +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) + + +Let's transform the image into a beautiful winter landscape: +py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``text +==Explanation from the agent== +I will use the following tool:image_transformer` to transform the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_134.txt b/chunked/content_aware_chunking/_custom_tools/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..a999c6f26af1386d84def9bce00f2dd8a0911f8b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_134.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") + + +The new image processing tool is based on ControlNet which can make very strong modifications to the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_135.txt b/chunked/content_aware_chunking/_custom_tools/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..61090f858e5579f2dc73cd3af48d6c078a0d6186 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_135.txt @@ -0,0 +1 @@ +By default the image processing tool returns an image of size 512x512 pixels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_136.txt b/chunked/content_aware_chunking/_custom_tools/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..be9dc045ca1aa2a3ac6bc2651002c242e7ba2d2a --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_136.txt @@ -0,0 +1 @@ +Let's see if we can upscale it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_137.txt b/chunked/content_aware_chunking/_custom_tools/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8749ddf6a9024530d69fdcc678a0c6bd6671e0b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_137.txt @@ -0,0 +1,5 @@ +py +image = agent.run("Upscale the image", image) +``text +==Explanation from the agent== +I will use the following tool:image_upscaler` to upscale the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_138.txt b/chunked/content_aware_chunking/_custom_tools/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6aee9ed429b1584b1e329f20e439417ffe9ea18 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_138.txt @@ -0,0 +1,6 @@ +==Code generated by the agent== +upscaled_image = image_upscaler(image) + + +The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool +and was able to correctly run it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_139.txt b/chunked/content_aware_chunking/_custom_tools/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa898eafcc8dc4387cf599fb1163907e5ac9da70 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_139.txt @@ -0,0 +1 @@ +Next, let's have a look at how you can create a new custom tool. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_140.txt b/chunked/content_aware_chunking/_custom_tools/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e9efb749fc698fb310035fc3760bd3fb8196cea --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_140.txt @@ -0,0 +1,2 @@ +Adding new tools +In this section, we show how to create a new tool that can be added to the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_141.txt b/chunked/content_aware_chunking/_custom_tools/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c11212837eeba3b661d65eda101e48df09c883f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_141.txt @@ -0,0 +1,2 @@ +Creating a new tool +We'll first start by creating a tool. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_142.txt b/chunked/content_aware_chunking/_custom_tools/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..227229d3990650c6f70ef0bdf49e5ecd6f9a77a9 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_142.txt @@ -0,0 +1,2 @@ +We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face +Hub with the most downloads for a given task. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_143.txt b/chunked/content_aware_chunking/_custom_tools/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..db453e4704233e8665f5a3b927e7ce15c0239497 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_143.txt @@ -0,0 +1,8 @@ +We can do that with the following code: +python +from huggingface_hub import list_models +task = "text-classification" +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) + +For the task text-classification, this returns 'facebook/bart-large-mnli', for translation it returns 'google-t5/t5-base'. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_144.txt b/chunked/content_aware_chunking/_custom_tools/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..16059592c1a7d31c04ac8d277aac96ac75afc9e1 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_144.txt @@ -0,0 +1 @@ +How do we convert this to a tool that the agent can leverage? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_145.txt b/chunked/content_aware_chunking/_custom_tools/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e3219ba88fe28bbeec86600c899bde6bb9f13a9 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_145.txt @@ -0,0 +1,2 @@ +All tools depend on the superclass Tool that holds the +main attributes necessary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_146.txt b/chunked/content_aware_chunking/_custom_tools/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f02ad6fd309b047ec2b7e66382bd77a31b4eef5 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_146.txt @@ -0,0 +1,8 @@ +We'll create a class that inherits from it: +python +from transformers import Tool +class HFModelDownloadsTool(Tool): + pass + +This class has a few needs: +- An attribute name, which corresponds to the name of the tool itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_147.txt b/chunked/content_aware_chunking/_custom_tools/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbc5e02112904ae8deba41131eac932a6fcbe490 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_147.txt @@ -0,0 +1,2 @@ +To be in tune with other tools which have a + performative name, we'll name it model_download_counter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_148.txt b/chunked/content_aware_chunking/_custom_tools/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..56863af689cec71e5c7e066ef013c46a9e1a9d5d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_148.txt @@ -0,0 +1 @@ +- An attribute description, which will be used to populate the prompt of the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_149.txt b/chunked/content_aware_chunking/_custom_tools/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..77476cafe2d93c31a6e279359e2f04a58097fcd6 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_149.txt @@ -0,0 +1 @@ +- inputs and outputs attributes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_150.txt b/chunked/content_aware_chunking/_custom_tools/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff9deef77bebdcea5ed50e957bc8c4b47c75f300 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_150.txt @@ -0,0 +1,2 @@ +Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_151.txt b/chunked/content_aware_chunking/_custom_tools/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..0042fbca5f8af7a8fec966a29bdca3187fbf0804 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_151.txt @@ -0,0 +1,2 @@ +They're both a list of expected + values, which can be text, image, or audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_152.txt b/chunked/content_aware_chunking/_custom_tools/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..e97aabcfaaf39ea0a9fee9bed912ffb121747de2 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_152.txt @@ -0,0 +1 @@ +- A __call__ method which contains the inference code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_153.txt b/chunked/content_aware_chunking/_custom_tools/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f8343bb6ab911af649f486fa2f49dda42b9c5f3 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_153.txt @@ -0,0 +1 @@ +This is the code we've played with above! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_154.txt b/chunked/content_aware_chunking/_custom_tools/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4a7f78c94ba6498b6e005cde26f3b6de4a9231c --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_154.txt @@ -0,0 +1,8 @@ +Here's what our class looks like now: +thon +from transformers import Tool +from huggingface_hub import list_models +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_155.txt b/chunked/content_aware_chunking/_custom_tools/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..734c8f349706176556b800de4103f47b5f5a3f05 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_155.txt @@ -0,0 +1,2 @@ +"It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_156.txt b/chunked/content_aware_chunking/_custom_tools/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..09efbcb7da483dd41db003f830444b5786d28362 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_156.txt @@ -0,0 +1,9 @@ +) +inputs = ["text"] +outputs = ["text"] + +def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id + +We now have our tool handy. 
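Before wiring it into an agent, you can sanity-check the tool by calling it directly, since a Tool instance is callable (a sketch; the returned checkpoint depends on current download counts on the Hub):

```python
# Sketch: exercise the __call__ method defined above directly, outside of any agent.
tool = HFModelDownloadsTool()
print(tool("text-classification"))   # prints the currently most downloaded checkpoint for the task
```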
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_157.txt b/chunked/content_aware_chunking/_custom_tools/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..90ea5bfb8e2dd3e543ed03ca57bff9aeab387a38 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_157.txt @@ -0,0 +1 @@ +Save it in a file and import it from your main script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_158.txt b/chunked/content_aware_chunking/_custom_tools/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a2c69f47a96dfa3d478c0352fd37beccb6caca5 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_158.txt @@ -0,0 +1,8 @@ +Let's name this file +model_downloads.py, so the resulting import code looks like this: +thon +from model_downloads import HFModelDownloadsTool +tool = HFModelDownloadsTool() + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_159.txt b/chunked/content_aware_chunking/_custom_tools/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b48caae3bb87b96b13f7c8b70bd809ecdcd5826 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_159.txt @@ -0,0 +1,4 @@ +To do so, just call push_to_hub on the tool variable: +python +tool.push_to_hub("hf-model-downloads") +You now have your code on the Hub! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_160.txt b/chunked/content_aware_chunking/_custom_tools/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..8503cb9323ff2d347a3a55394eb12ff44c58377c --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_160.txt @@ -0,0 +1 @@ +Let's take a look at the final step, which is to have the agent use it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_161.txt b/chunked/content_aware_chunking/_custom_tools/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3be8096fb7871c6429cdef2a89802da1aacb8be --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_161.txt @@ -0,0 +1,12 @@ +Having the agent use the tool +We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool): +thon +from transformers import load_tool +tool = load_tool("lysandre/hf-model-downloads") + +In order to use it in the agent, simply pass it in the additional_tools parameter of the agent initialization method: +thon +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_162.txt b/chunked/content_aware_chunking/_custom_tools/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..510b1727c14d6c4951a882be4812f145832682c0 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_162.txt @@ -0,0 +1,5 @@ +) +which outputs the following:text +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_163.txt b/chunked/content_aware_chunking/_custom_tools/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a2821ceb1f6e0e2d058c0b3bac9fcf7bd225418 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_163.txt @@ -0,0 +1,3 @@ +audio_model = text_reader(model) +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_164.txt b/chunked/content_aware_chunking/_custom_tools/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..175cb23a2f87bca8af3a9e7f9b394b6ef319bf58 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_164.txt @@ -0,0 +1 @@ +and generates the following audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_165.txt b/chunked/content_aware_chunking/_custom_tools/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b8808dc03aa37d16dd19245d8567f9e3909f299 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_165.txt @@ -0,0 +1,5 @@ +| Audio | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +| | + +Depending on the LLM, some are quite brittle and require very exact prompts in order to work well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_166.txt b/chunked/content_aware_chunking/_custom_tools/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f972cae920a1273dbf33ce09462e011c0a3730f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_166.txt @@ -0,0 +1,2 @@ +Having a well-defined +name and description of the tool is paramount to having it be leveraged by the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_167.txt b/chunked/content_aware_chunking/_custom_tools/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a39b1565949dec96a8aebdbb5e21435fb172e30 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_167.txt @@ -0,0 +1,2 @@ +Replacing existing tools +Replacing existing tools can be done simply by assigning a new item to the agent's toolbox. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_168.txt b/chunked/content_aware_chunking/_custom_tools/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..180313b925d4ed7b12efa6f0639d358bd043cd66 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_168.txt @@ -0,0 +1,7 @@ +Here's how one would do so: +thon +from transformers import HfAgent, load_tool +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool") + +Beware when replacing tools with others! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_169.txt b/chunked/content_aware_chunking/_custom_tools/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc315c041aa3d3ed9375bded35a2fe61b4c6f3bc --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_169.txt @@ -0,0 +1 @@ +This will also adjust the agent's prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_170.txt b/chunked/content_aware_chunking/_custom_tools/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a94dd3afea4562f6c2a669c85a750f906e6f67e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_170.txt @@ -0,0 +1,3 @@ +This can be good if you have a better +prompt suited for the task, but it can also result in your tool being selected way more than others or for other +tools to be selected instead of the one you have defined. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_171.txt b/chunked/content_aware_chunking/_custom_tools/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..899714223552df2692fd012d1091eb4a0f386f00 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_171.txt @@ -0,0 +1,3 @@ +Leveraging gradio-tools +gradio-tools is a powerful library that allows using Hugging +Face Spaces as tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_172.txt b/chunked/content_aware_chunking/_custom_tools/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..6326adfdad8dbe3486358d981bcf7c799348ffdc --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_172.txt @@ -0,0 +1 @@ +It supports many existing Spaces as well as custom Spaces to be designed with it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_173.txt b/chunked/content_aware_chunking/_custom_tools/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..da21b39ec93175c18fd3d1f791175c54d1be97ca --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_173.txt @@ -0,0 +1 @@ +We offer support for gradio_tools by using the Tool.from_gradio method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_174.txt b/chunked/content_aware_chunking/_custom_tools/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb7adc607be28f6ec21a27cd6a35e9ceba30fdb8 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_174.txt @@ -0,0 +1,3 @@ +For example, we want to take +advantage of the StableDiffusionPromptGeneratorTool tool offered in the gradio-tools toolkit so as to +improve our prompts and generate better images. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_175.txt b/chunked/content_aware_chunking/_custom_tools/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..1abf320ba64651d09dadb41a9d81076815262cf0 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_175.txt @@ -0,0 +1,11 @@ +We first import the tool from gradio_tools and instantiate it: +thon +from gradio_tools import StableDiffusionPromptGeneratorTool +gradio_tool = StableDiffusionPromptGeneratorTool() + +We pass that instance to the Tool.from_gradio method: +thon +from transformers import Tool +tool = Tool.from_gradio(gradio_tool) + +Now we can manage it exactly as we would a usual custom tool. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_176.txt b/chunked/content_aware_chunking/_custom_tools/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6e1b3af162f32403b88401a506740a35d3ed483 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_176.txt @@ -0,0 +1,6 @@ +We leverage it to improve our prompt +a rabbit wearing a space suit: +thon +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) +agent.run("Generate an image of the prompt after improving it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_177.txt b/chunked/content_aware_chunking/_custom_tools/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..421623f3a93f83bc16d0cd4948fc196d706a93b4 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_177.txt @@ -0,0 +1,6 @@ +", prompt="A rabbit wearing a space suit") + +The model adequately leverages the tool: +``text +==Explanation from the agent== +I will use the following tools:StableDiffusionPromptGeneratorto improve the prompt, thenimage_generator` to generate an image according to the improved prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_178.txt b/chunked/content_aware_chunking/_custom_tools/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59a16e179efeab9cdefce2ea6946335502f8034 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_178.txt @@ -0,0 +1,3 @@ +==Code generated by the agent== +improved_prompt = StableDiffusionPromptGenerator(prompt) +print(f"The improved prompt is {improved_prompt}.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_179.txt b/chunked/content_aware_chunking/_custom_tools/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0b6ffaa2fac29a109e976859b132fe8f0cd1fb0 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_179.txt @@ -0,0 +1,5 @@ +image = image_generator(improved_prompt) + +Before finally generating the image: + +gradio-tools requires textual inputs and outputs, even when working with different modalities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_180.txt b/chunked/content_aware_chunking/_custom_tools/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ea3907baf0db4284d4d7ee90173e1c45c05d3dc --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_180.txt @@ -0,0 +1,2 @@ +This implementation +works with image and audio objects. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_181.txt b/chunked/content_aware_chunking/_custom_tools/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..986021f7c22df3ea11092631726e6920b07a149a --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_181.txt @@ -0,0 +1,2 @@ +The two are currently incompatible, but will rapidly become compatible as we +work to improve the support. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_182.txt b/chunked/content_aware_chunking/_custom_tools/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..b79ab4d162f61568f733ea15c6d24a4f11effc7b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_182.txt @@ -0,0 +1,2 @@ +Future compatibility with Langchain +We love Langchain and think it has a very compelling suite of tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_183.txt b/chunked/content_aware_chunking/_custom_tools/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..15ea353a8a5e6656aa422ef240806fd736a1117e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_183.txt @@ -0,0 +1,2 @@ +In order to handle these tools, +Langchain requires textual inputs and outputs, even when working with different modalities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_184.txt b/chunked/content_aware_chunking/_custom_tools/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f092e653bc2af4d5b66b3a6d97fb04aadc12d62 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_184.txt @@ -0,0 +1 @@ +This is often the serialized version (i.e., saved to disk) of the objects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_185.txt b/chunked/content_aware_chunking/_custom_tools/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..445818f2d898f9236c5cf8e6b9ad97ff22d3eaec --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_185.txt @@ -0,0 +1 @@ +This difference means that multi-modality isn't handled between transformers-agents and langchain. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_186.txt b/chunked/content_aware_chunking/_custom_tools/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf77006f51001ef93a17fddab60532c79fe9539d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_186.txt @@ -0,0 +1,2 @@ +We aim for this limitation to be resolved in future versions, and welcome any help from avid langchain +users to help us achieve this compatibility. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_187.txt b/chunked/content_aware_chunking/_custom_tools/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e57a1b6b331f5c26ebe6105caaa8a48c835127f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_187.txt @@ -0,0 +1 @@ +We would love to have better support. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_188.txt b/chunked/content_aware_chunking/_custom_tools/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..68055618d355682fb1c80849150adc37c06e5b33 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_188.txt @@ -0,0 +1,2 @@ +If you would like to help, please +open an issue and share what you have in mind. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_57.txt b/chunked/content_aware_chunking/_custom_tools/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..567a53e59fd63f011f5bd506aef3ef117fb53733 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_57.txt @@ -0,0 +1,3 @@ +This sentence makes up the final lines of the +prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example +exactly in the same way it was previously done in the examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_58.txt b/chunked/content_aware_chunking/_custom_tools/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9278799fea2aa57e72446adfe8f9451f8fdc362 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_58.txt @@ -0,0 +1,2 @@ +Without going into too much detail, the chat template has the same prompt structure with the +examples having a slightly different style, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_59.txt b/chunked/content_aware_chunking/_custom_tools/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a83bc83507f32d83e6c992b558f0eed503a58ee --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_59.txt @@ -0,0 +1,5 @@ +: +````text +[] +===== +Human: Answer the question in the variable question about the image stored in the variable image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_60.txt b/chunked/content_aware_chunking/_custom_tools/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d20a12e5f620f08b5b3848df642a50b8ac4c21e --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_60.txt @@ -0,0 +1 @@ +Assistant: I will use the tool image_qa to answer the question on the input image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_61.txt b/chunked/content_aware_chunking/_custom_tools/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..245820a01fb92bc40ef9a0256c4d024c8afcd49d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_61.txt @@ -0,0 +1,4 @@ +py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +Human: I tried this code, it worked but didn't give me a good result. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_62.txt b/chunked/content_aware_chunking/_custom_tools/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6e107ffea240293835abe1acc39b3d5d679d525 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_62.txt @@ -0,0 +1,2 @@ +The question is in French +Assistant: In this case, the question needs to be translated first. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_63.txt b/chunked/content_aware_chunking/_custom_tools/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..b93dac47df1b09c3254401ae734a37ab3d8f6511 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_63.txt @@ -0,0 +1 @@ +I will use the tool translator to do this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_64.txt b/chunked/content_aware_chunking/_custom_tools/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbbae20d61258fa6f1d5dc7ac775c6156e87b294 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_64.txt @@ -0,0 +1,3 @@ +py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_65.txt b/chunked/content_aware_chunking/_custom_tools/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd890add4fc3a1001f6fa25855786f2a14717dca --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_65.txt @@ -0,0 +1,7 @@ +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +===== +[] +` +Contrary, to the examples of the run prompt, each chat prompt example has one or more exchanges between the +Human and the Assistant. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_66.txt b/chunked/content_aware_chunking/_custom_tools/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..30535c2f1043d90ae42204273d06e52ca8d8a1d4 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_66.txt @@ -0,0 +1 @@ +Every exchange is structured similarly to the example of the run prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_67.txt b/chunked/content_aware_chunking/_custom_tools/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0fba436966fa62cd8c5d7b3e54e93cfde3704c5 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_67.txt @@ -0,0 +1,2 @@ +The user's input is appended to behind Human: and the agent is prompted to first generate what needs to be done +before generating code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_68.txt b/chunked/content_aware_chunking/_custom_tools/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..a50d9acf39efc1f11d399b8bd1f2a679f5790160 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_68.txt @@ -0,0 +1,2 @@ +An exchange can be based on previous exchanges, therefore allowing the user to refer +to past exchanges as is done e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_69.txt b/chunked/content_aware_chunking/_custom_tools/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7201c275f89e0d7cfed82bbe44fbe422760527f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_69.txt @@ -0,0 +1,2 @@ +above by the user's input of "I tried this code" refers to the +previously generated code of the agent. 
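The multi-turn behaviour described in these chunks can be tried directly through the agent's chat mode. A minimal sketch (the endpoint URL matches the one used in the chunks above, but the prompts are illustrative, not taken from the source):

```python
from transformers import HfAgent

agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")

# First turn: the task is cast into "Human: ...\n\nAssistant:" and completed by the agent.
agent.chat("Generate an image of rivers and lakes")

# Second turn: because the completed exchange is appended to the prompt,
# "the image" can refer back to the result of the previous turn.
agent.chat("Transform the image so that there is a rock in it")
```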
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_70.txt b/chunked/content_aware_chunking/_custom_tools/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..adbbe08db743f049859273b68404ee1e8520a1c9 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_70.txt @@ -0,0 +1,4 @@ +Upon running .chat, the user's input or task is cast into an unfinished example of the form: +text +Human: \n\nAssistant: +which the agent completes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_71.txt b/chunked/content_aware_chunking/_custom_tools/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b13c062364b5495dac85ba07f3eef9cd070849f --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_71.txt @@ -0,0 +1,2 @@ +Contrary to the run command, the chat command then appends the completed example +to the prompt, thus giving the agent more context for the next chat turn. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_72.txt b/chunked/content_aware_chunking/_custom_tools/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..979cfe51bf47b503d768bb0332eaf4cf23cc8cef --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_72.txt @@ -0,0 +1 @@ +Great now that we know how the prompt is structured, let's see how we can customize it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_73.txt b/chunked/content_aware_chunking/_custom_tools/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebfad7baac6a19eda34c1abda64aa4c7e330b859 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_73.txt @@ -0,0 +1,3 @@ +Writing good user inputs +While large language models are getting better and better at understanding users' intentions, it helps +enormously to be as precise as possible to help the agent pick the correct task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_74.txt b/chunked/content_aware_chunking/_custom_tools/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..80cd447c8293d17f2a5e78c17bb542bfd913a85a --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_74.txt @@ -0,0 +1,2 @@ +What does it mean to be +as precise as possible? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_75.txt b/chunked/content_aware_chunking/_custom_tools/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..a897ac1782def702c19726d965869e147df5d823 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_75.txt @@ -0,0 +1 @@ +The agent sees a list of tool names and their description in its prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_76.txt b/chunked/content_aware_chunking/_custom_tools/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..8128510d7dd068bf6d65960c02f51ed71ccf9dbd --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_76.txt @@ -0,0 +1,3 @@ +The more tools are added the +more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose +the correct sequences of tools to run. 
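Since tool choice is driven entirely by the names and descriptions placed in the prompt, it can help to inspect exactly what the agent sees before refining a request. A small sketch, assuming the default toolbox of an already-instantiated agent:

```python
# List every tool the agent can choose from, together with the description
# the LLM is conditioned on when picking tools.
for name, tool in agent.toolbox.items():
    print(f"{name}: {tool.description}")
```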
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_77.txt b/chunked/content_aware_chunking/_custom_tools/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..f225bb3e1f2d1cad8e025ef63bea0144c056c768 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_77.txt @@ -0,0 +1,2 @@ +Let's look at a common failure case, here we will only return +the code to analyze it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_78.txt b/chunked/content_aware_chunking/_custom_tools/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..96f24f905fd608bd75a2bd9f9a9ef39d65918282 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_78.txt @@ -0,0 +1,8 @@ +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +agent.run("Show me a tree", return_code=True) + +gives: +``text +==Explanation from the agent== +I will use the following tool:image_segmenter` to create a segmentation mask for the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_79.txt b/chunked/content_aware_chunking/_custom_tools/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4cf8c4bf192e5782fd9710c2cdb8c3617d6fd2d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_79.txt @@ -0,0 +1,4 @@ +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") + +which is probably not what we wanted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_80.txt b/chunked/content_aware_chunking/_custom_tools/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..12816d233a63a831248f0e024aec5c78cbfb9e5b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_80.txt @@ -0,0 +1 @@ +Instead, it is more likely that we want an image of a tree to be generated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_81.txt b/chunked/content_aware_chunking/_custom_tools/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..1576b1bed1ced3266aebe3d97b16ea89a23e8a22 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_81.txt @@ -0,0 +1,2 @@ +To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that +are present in the tool's name and description. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_82.txt b/chunked/content_aware_chunking/_custom_tools/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..8211825d737ce6ed745054395daaa441294a3d72 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_82.txt @@ -0,0 +1 @@ +Let's have a look. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_83.txt b/chunked/content_aware_chunking/_custom_tools/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..c69e2fee291a48502dcbd7f881311d149ba73aff --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_83.txt @@ -0,0 +1,4 @@ +py +agent.toolbox["image_generator"].description +text +'This is a tool that creates an image according to a prompt, which is a text description. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_84.txt b/chunked/content_aware_chunking/_custom_tools/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0aab639a2aff1d34d26bbf78611e11b8971ba9d --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_84.txt @@ -0,0 +1 @@ +It takes an input named `prompt` which contains the image description and outputs an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_85.txt b/chunked/content_aware_chunking/_custom_tools/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..18cc00af22f085c3dee90116e0986249886c6888 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_85.txt @@ -0,0 +1 @@ +The name and description make use of the keywords "image", "prompt", "create" and "generate". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_86.txt b/chunked/content_aware_chunking/_custom_tools/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b21f4e64c1f4397c0442fad02cdabf10ced8f45 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_86.txt @@ -0,0 +1 @@ +Using these words will most likely work better here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_87.txt b/chunked/content_aware_chunking/_custom_tools/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6426407ffd6f3038ba8f5647900c628e29e4590 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_87.txt @@ -0,0 +1 @@ +Let's refine our prompt a bit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_88.txt b/chunked/content_aware_chunking/_custom_tools/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30dcdbbcfde72ce95da52e0ee2a911da65554ae --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_88.txt @@ -0,0 +1,6 @@ +py +agent.run("Create an image of a tree", return_code=True) +gives: +``text +==Explanation from the agent== +I will use the following toolimage_generator` to generate an image of a tree. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_89.txt b/chunked/content_aware_chunking/_custom_tools/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9425a167ee86844b06d5da4f5ca39aa21e9ca73 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_89.txt @@ -0,0 +1,4 @@ +==Code generated by the agent== +image = image_generator(prompt="tree") + +Much better! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_90.txt b/chunked/content_aware_chunking/_custom_tools/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..86b3eddafef1d22151d6cbe2f0b1067b6bc9aba7 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_90.txt @@ -0,0 +1 @@ +That looks more like what we want. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_91.txt b/chunked/content_aware_chunking/_custom_tools/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d945622982117a1abc13d89108f9afe3407b85 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_91.txt @@ -0,0 +1,3 @@ +In short, when you notice that the agent struggles to +correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name +and description and try refining your task request with it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_92.txt b/chunked/content_aware_chunking/_custom_tools/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e5d064d0725311b1c08519218c734d6c8c5c00 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_92.txt @@ -0,0 +1,2 @@ +Customizing the tool descriptions +As we've seen before the agent has access to each of the tools' names and descriptions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_93.txt b/chunked/content_aware_chunking/_custom_tools/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a15c16a5265048adffabc67dd7576f1fdeaf47 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_93.txt @@ -0,0 +1,3 @@ +The base tools +should have very precise names and descriptions, however, you might find that it could help to change the +the description or name of a tool for your specific use case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_94.txt b/chunked/content_aware_chunking/_custom_tools/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..23a5f3a3839b587304b8b2c33ec3f7146797d63b --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_94.txt @@ -0,0 +1,3 @@ +This might become especially important +when you've added multiple tools that are very similar or if you want to use your agent only for a certain +domain, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_95.txt b/chunked/content_aware_chunking/_custom_tools/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..4970a72244896f1fee27f2774c02ddbdf0b55a07 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_95.txt @@ -0,0 +1 @@ +image generation and transformations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_96.txt b/chunked/content_aware_chunking/_custom_tools/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..001abd454487b6f0cbf8293c48d1685f697b4b52 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_96.txt @@ -0,0 +1,2 @@ +A common problem is that the agent confuses image generation with image transformation/modification when +used a lot for image generation tasks, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_97.txt b/chunked/content_aware_chunking/_custom_tools/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..796e033452a781254ef743e2108020134b68f6a5 --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_97.txt @@ -0,0 +1,6 @@ +py +agent.run("Make an image of a house and a car", return_code=True) +returns +``text +==Explanation from the agent== +I will use the following toolsimage_generatorto generate an image of a house andimage_transformer` to transform the image of a car into the image of a house. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_98.txt b/chunked/content_aware_chunking/_custom_tools/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..62c95891ff37361787381e269c58f710e62de0bd --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_98.txt @@ -0,0 +1,6 @@ +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") + +which is probably not exactly what we want here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_custom_tools/chunk_99.txt b/chunked/content_aware_chunking/_custom_tools/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac83f1fa40f94b1b43f64fff8fe92e49d53c9ef --- /dev/null +++ b/chunked/content_aware_chunking/_custom_tools/chunk_99.txt @@ -0,0 +1,2 @@ +It seems like the agent has a difficult time +to understand the difference between image_generator and image_transformer and often uses the two together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_31.txt b/chunked/content_aware_chunking/_debugging/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b21b3e5da2faae7589e61930f334a9561cf03f56 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_31.txt @@ -0,0 +1 @@ +It is also possible to not specify TORCH_CUDA_ARCH_LIST and the build program automatically queries the GPU architecture of the build. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_32.txt b/chunked/content_aware_chunking/_debugging/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8dcad02cb91b510f2aad589a867d1ad1b5dcad8 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_32.txt @@ -0,0 +1 @@ +However, it may or may not match the actual GPU on the target machine which is why it is better to explicitly specify the correct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_33.txt b/chunked/content_aware_chunking/_debugging/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..60373010390780941453e529109f14157bfbd05e --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_33.txt @@ -0,0 +1,8 @@ +For training on multiple machines with the same setup, you'll need to make a binary wheel: + +git clone https://github.com/microsoft/DeepSpeed/ +cd DeepSpeed +rm -rf build +TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \ +python setup.py build_ext -j8 bdist_wheel +This command generates a binary wheel that'll look something like dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_34.txt b/chunked/content_aware_chunking/_debugging/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..94b0963c4b88903e6c2fd8f0cde6983acd735d41 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_34.txt @@ -0,0 +1 @@ +Now you can install this wheel locally or on another machine. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_35.txt b/chunked/content_aware_chunking/_debugging/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1ddba78eeb0e5823c0a7b4dae907e1db9395f53 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_35.txt @@ -0,0 +1,3 @@ +pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl +Multi-GPU Network Issues Debug +When training or inferencing with DistributedDataParallel and multiple GPU, if you run into issue of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_36.txt b/chunked/content_aware_chunking/_debugging/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c064d37e272ac74181f0fccf600ac3c25e0219bb --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_36.txt @@ -0,0 +1,5 @@ +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +For example to test how 2 GPUs interact do: + +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +If both processes can talk to each and allocate GPU memory each will print an OK status. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_37.txt b/chunked/content_aware_chunking/_debugging/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..e029fa56c241877b15e63139d261a8d2c6713ca3 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_37.txt @@ -0,0 +1 @@ +For more GPUs or nodes adjust the arguments in the script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_38.txt b/chunked/content_aware_chunking/_debugging/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d7ae6dc5ea5ba599c61964e80c39bdb35b15f9d --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_38.txt @@ -0,0 +1 @@ +You will find a lot more details inside the diagnostics script and even a recipe to how you could run it in a SLURM environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_39.txt b/chunked/content_aware_chunking/_debugging/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f84e2c8aa0298a92afda4fab0e96be3344d4b2a --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_39.txt @@ -0,0 +1,4 @@ +An additional level of debug is to add NCCL_DEBUG=INFO environment variable as follows: + +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_40.txt b/chunked/content_aware_chunking/_debugging/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..c44ab10ca439754ff47a587c12b030233ffb6193 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_40.txt @@ -0,0 +1 @@ +Or if you're not sure how to interpret the output you can share the log file in an Issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_41.txt b/chunked/content_aware_chunking/_debugging/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d34b4234457572b09ae735748516494e54dc1a3 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_41.txt @@ -0,0 +1,3 @@ +Underflow and Overflow Detection + +This feature is currently available for PyTorch-only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_42.txt b/chunked/content_aware_chunking/_debugging/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..77736b8e3c99b062f9e7c1a3e97a5e5a87a879ce --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_42.txt @@ -0,0 +1 @@ +For multi-GPU training it requires DDP (torch.distributed.launch). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_43.txt b/chunked/content_aware_chunking/_debugging/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..719d288c9055088e86a1bce2e4046cc96c32f561 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_43.txt @@ -0,0 +1 @@ +This feature can be used with any nn.Module-based model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_44.txt b/chunked/content_aware_chunking/_debugging/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4251dbe346d645107e17aed12ae92134501f73b --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_44.txt @@ -0,0 +1,2 @@ +If you start getting loss=NaN or the model inhibits some other abnormal behavior due to inf or nan in +activations or weights one needs to discover where the first underflow or overflow happens and what led to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_45.txt b/chunked/content_aware_chunking/_debugging/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7fc249eb79d4e4dec0a53a91b2b925f31ae74d9 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_45.txt @@ -0,0 +1,2 @@ +Luckily +you can accomplish that easily by activating a special module that will do the detection automatically. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_46.txt b/chunked/content_aware_chunking/_debugging/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dc3d577aba6f58a005e2785c0fec8ac6b5ea083 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_46.txt @@ -0,0 +1,5 @@ +If you're using [Trainer], you just need to add: + +--debug underflow_overflow +to the normal command line arguments, or pass debug="underflow_overflow" when creating the +[TrainingArguments] object. 
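For the [Trainer] path, a minimal sketch of what that looks like in code (the output directory is a placeholder):

```python
from transformers import TrainingArguments

# Equivalent to adding --debug underflow_overflow on the command line.
training_args = TrainingArguments(
    output_dir="out",            # placeholder output directory
    debug="underflow_overflow",
)
```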
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_47.txt b/chunked/content_aware_chunking/_debugging/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..08d2dd496e4e02331fdd315d5b5efb6d092eb14d --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_47.txt @@ -0,0 +1,7 @@ +If you're using your own training loop or another Trainer you can accomplish the same with: +thon +from transformers.debug_utils import DebugUnderflowOverflow +debug_overflow = DebugUnderflowOverflow(model) + +[~debug_utils.DebugUnderflowOverflow] inserts hooks into the model that immediately after each +forward call will test input and output variables and also the corresponding module's weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_48.txt b/chunked/content_aware_chunking/_debugging/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f5fc401931159227341272ecd3ee591d5a76a3d --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_48.txt @@ -0,0 +1,41 @@ +As soon as inf or +nan is detected in at least one element of the activations or weights, the program will assert and print a report +like this (this was caught with google/mt5-small under fp16 mixed precision): +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[] + encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +The example output has been trimmed in the middle for brevity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_49.txt b/chunked/content_aware_chunking/_debugging/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea251f5f55cdd6c022ba32acec4d66efcfa80a34 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_49.txt @@ -0,0 +1,2 @@ +The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames, +the inputs and outputs were in the range of 1e4. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_50.txt b/chunked/content_aware_chunking/_debugging/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f3a439e75f70abfa11302b03c8f41294a8cb83 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_50.txt @@ -0,0 +1,2 @@ +So when this training was done under fp16 mixed precision the very +last step overflowed (since under fp16 the largest number before inf is 64e3). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_51.txt b/chunked/content_aware_chunking/_debugging/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..440a92f573dc79d68534c2d18a589755d8e8aa03 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_51.txt @@ -0,0 +1,3 @@ +To avoid overflows under +fp16 the activations must remain way below 1e4, because 1e4 * 1e4 = 1e8 so any matrix multiplication with +large activations is going to lead to a numerical overflow condition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_52.txt b/chunked/content_aware_chunking/_debugging/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..9844958b972f297cef4f420c0a4ba4ccd2aba7de --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_52.txt @@ -0,0 +1 @@ +At the very start of the trace you can discover at which batch number the problem occurred (here Detected inf/nan during batch_number=0 means the problem occurred on the first batch). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_53.txt b/chunked/content_aware_chunking/_debugging/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66ed0b40f364aec404260d99b0db6d18534e47f --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_53.txt @@ -0,0 +1,2 @@ +Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting +for. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_54.txt b/chunked/content_aware_chunking/_debugging/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d9951c9c753b912e5ede9233e9fdee9ff113902 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_54.txt @@ -0,0 +1,7 @@ +If we look just at this frame: +encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +Here, encoder.block.2.layer.1.layer_norm indicates that it was a layer norm for the first layer, of the second +block of the encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_55.txt b/chunked/content_aware_chunking/_debugging/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c5ca2ce9c239ae5c1eca1723a15fa3771a8a87c --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_55.txt @@ -0,0 +1 @@ +And the specific calls of the forward is T5LayerNorm. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_56.txt b/chunked/content_aware_chunking/_debugging/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab978abab066bd337e95f878b15817db3c04273 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_56.txt @@ -0,0 +1,25 @@ +Let's look at the last few frames of that report: +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[] + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +The last frame reports for Dropout.forward function with the first entry for the only input and the second for the +only output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_57.txt b/chunked/content_aware_chunking/_debugging/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a1afcbb511f83da0886cab05a519dd6f1b80fa3 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_57.txt @@ -0,0 +1 @@ +You can see that it was called from an attribute dropout inside DenseReluDense class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_58.txt b/chunked/content_aware_chunking/_debugging/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f09f6eaeb255cb6b3ce1df61c3d7bb885c49889b --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_58.txt @@ -0,0 +1,2 @@ +We can see +that it happened during the first layer, of the 2nd block, during the very first batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_59.txt b/chunked/content_aware_chunking/_debugging/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e6091d0c705e6a3ca84739eadc0b74f50f1f881 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_59.txt @@ -0,0 +1,2 @@ +Finally, the absolute largest +input elements was 6.27e+04 and same for the output was inf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_60.txt b/chunked/content_aware_chunking/_debugging/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1246760d7c554f0cb6bf7d15e414d515be80fdd5 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_60.txt @@ -0,0 +1,2 @@ +You can see here, that T5DenseGatedGeluDense.forward resulted in output activations, whose absolute max value was +around 62.7K, which is very close to fp16's top limit of 64K. 
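The fp16 ceiling referred to here is easy to verify in isolation; a standalone illustration, unrelated to the T5 model itself:

```python
import torch

print(torch.finfo(torch.float16).max)           # 65504.0 -- the largest finite fp16 value
x = torch.tensor(62700.0, dtype=torch.float16)
print(x)                                        # still finite, just below the ceiling
print(x * 1.1)                                  # tensor(inf, dtype=torch.float16) -- overflow
```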
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_61.txt b/chunked/content_aware_chunking/_debugging/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..377be1a08a98788ef81843a517da95e635537be7 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_61.txt @@ -0,0 +1,3 @@ +In the next frame we have Dropout which renormalizes +the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an +overflow (inf). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_62.txt b/chunked/content_aware_chunking/_debugging/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..552083cae551aa26bd3b61256e50bb6f071c43c1 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_62.txt @@ -0,0 +1,2 @@ +As you can see it's the previous frames that we need to look into when the numbers start going into very large for fp16 +numbers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_63.txt b/chunked/content_aware_chunking/_debugging/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b0969e30874c91c1e3c4149f668cb3958af3918 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_63.txt @@ -0,0 +1,19 @@ +Let's match the report to the code from models/t5/modeling_t5.py: +thon +class T5DenseGatedGeluDense(nn.Module): + def init(self, config): + super().init() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] +def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + +Now it's easy to see the dropout call, and all the previous calls as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_64.txt b/chunked/content_aware_chunking/_debugging/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd3ed07bfc369a83986bf0e13db1854e55ca3517 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_64.txt @@ -0,0 +1,2 @@ +Since the detection is happening in a forward hook, these reports are printed immediately after each forward +returns. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_65.txt b/chunked/content_aware_chunking/_debugging/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..c58952804cd478ffa46d3420f398ce54a61c06c9 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_65.txt @@ -0,0 +1,3 @@ +Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers +started to go up and most likely switch to the fp32 mode here, so that the numbers don't overflow when multiplied +or summed up. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_66.txt b/chunked/content_aware_chunking/_debugging/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..934204e08859eea7050b70449f96c3d95a6af032 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_66.txt @@ -0,0 +1 @@ +Of course, there might be other solutions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_67.txt b/chunked/content_aware_chunking/_debugging/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..737d2069d795ce2c2a748c98f76f5d356a5d7b99 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_67.txt @@ -0,0 +1,20 @@ +For example, we could turn off amp temporarily if it's +enabled, after moving the original forward into a helper wrapper, like so: +thon +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +import torch +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) + +Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may +want to analyse the intermediary stages of any specific forward function as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_68.txt b/chunked/content_aware_chunking/_debugging/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..d160d5ff1f15ba3e34777d9476e5628d9ac42d72 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_68.txt @@ -0,0 +1,15 @@ +In such a case you can use the +detect_overflow helper function to inject the detector where you want it, for example: +thon +from debug_utils import detect_overflow +class T5LayerFF(nn.Module): + [] +def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) + +You can see that we added 2 of these and now we track if inf or nan for forwarded_states was detected +somewhere in between. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_69.txt b/chunked/content_aware_chunking/_debugging/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..403dce3f9caa34032ed7057a869273a682c87142 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_69.txt @@ -0,0 +1,2 @@ +Actually, the detector already reports these because each of the calls in the example above is a nn.Module, but +let's say if you had some local direct calculations this is how you'd do that. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_70.txt b/chunked/content_aware_chunking/_debugging/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..09e93f3435d622cc0cb1ca262eec0391cee8d142 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_70.txt @@ -0,0 +1,2 @@ +Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from +its default, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_71.txt b/chunked/content_aware_chunking/_debugging/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..114730fc7de6442496724627a3cff5f09fa3a022 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_71.txt @@ -0,0 +1,7 @@ +: +thon +from transformers.debug_utils import DebugUnderflowOverflow +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) + +Specific batch absolute min and max value tracing +The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_72.txt b/chunked/content_aware_chunking/_debugging/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6de98ff64e1221c58d95607cb15b38bb8f6e471 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_72.txt @@ -0,0 +1,2 @@ +Let's say you want to watch the absolute min and max values for all the ingredients of each forward call of a given +batch, and only do that for batches 1 and 3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_73.txt b/chunked/content_aware_chunking/_debugging/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..35593fee296d03f9a3604e1cc42447b66c724b67 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_73.txt @@ -0,0 +1,4 @@ +Then you instantiate this class as: +python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_74.txt b/chunked/content_aware_chunking/_debugging/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..e86ff0b3d2a2fe5e0bc420d3ec3e973c9dcc3bd4 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_74.txt @@ -0,0 +1 @@ +Batches are 0-indexed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_75.txt b/chunked/content_aware_chunking/_debugging/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..409f5c82dc9402c646f47dd5eb9bf2b54b4b4393 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_75.txt @@ -0,0 +1,2 @@ +This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward +right to that area. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_76.txt b/chunked/content_aware_chunking/_debugging/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..a89e44e6e9339e5acaec68b6b32e3934c78082d3 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_76.txt @@ -0,0 +1,31 @@ +Here is a sample truncated output for such configuration: + + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[] + decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + *** Starting batch number=3 *** + +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[] + +Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may +not what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_77.txt b/chunked/content_aware_chunking/_debugging/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c04b91ee4ed793a13a2a6d3221652dd91e3a8ac --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_77.txt @@ -0,0 +1,2 @@ +For example, if +a problem starts happening at batch number 150. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_78.txt b/chunked/content_aware_chunking/_debugging/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..67344d79b08f6d379d0d55fbf65be89833fc5c6b --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_78.txt @@ -0,0 +1,2 @@ +So you can dump traces for batches 149 and 150 and compare where +numbers started to diverge. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_debugging/chunk_79.txt b/chunked/content_aware_chunking/_debugging/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1def4733b6f1be695ecb2e8d9f90488c3fdd9b7 --- /dev/null +++ b/chunked/content_aware_chunking/_debugging/chunk_79.txt @@ -0,0 +1,3 @@ +You can also specify the batch number after which to stop the training, with: +python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_100.txt b/chunked/content_aware_chunking/_deepspeed/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd066afd1ac7b765ce7a7ffe5c82d320620af36c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_100.txt @@ -0,0 +1,3 @@ +To enable this feature: + +For a Hugging Face model, set model.gradient_checkpointing_enable() or --gradient_checkpointing in the [Trainer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_101.txt b/chunked/content_aware_chunking/_deepspeed/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..995e677bb45b22fadd0eda2cadf7a331ca365c4e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_101.txt @@ -0,0 +1 @@ +For a non-Hugging Face model, use the DeepSpeed Activation Checkpointing API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_102.txt b/chunked/content_aware_chunking/_deepspeed/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..859194207c2fdeb7146c234f3d6d01ea564cddaf --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_102.txt @@ -0,0 +1 @@ +You could also replace the Transformers modeling code and replace torch.utils.checkpoint with the DeepSpeed API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_103.txt b/chunked/content_aware_chunking/_deepspeed/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aab68dbbfbe3a6c7a6585d556d1a9e30e7c5448 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_103.txt @@ -0,0 +1 @@ +This approach is more flexible because you can offload the forward activations to the CPU memory instead of recalculating them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_104.txt b/chunked/content_aware_chunking/_deepspeed/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..925c498422515f0a10b2baefad73c899cc40029b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_104.txt @@ -0,0 +1,2 @@ +Optimizer and scheduler +DeepSpeed and Transformers optimizer and scheduler can be mixed and matched as long as you don't enable offload_optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_105.txt b/chunked/content_aware_chunking/_deepspeed/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b217481f8b78e259918b1e09d8c4662132a305 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_105.txt @@ -0,0 +1 @@ +When offload_optimizer is enabled, you could use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_106.txt b/chunked/content_aware_chunking/_deepspeed/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..6768211b0932f6c9628796f90eafac615dc413cf --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_106.txt @@ -0,0 +1 @@ +The optimizer and scheduler parameters for the config file can be set from the command line to avoid hard to find errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_107.txt b/chunked/content_aware_chunking/_deepspeed/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ba7673bc6a92e96e4b920beea2a8d614230dc6b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_107.txt @@ -0,0 +1 @@ +For example, if the learning rate is set to a different value in another place you can override it from the command line. 
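As a rough sketch of that override (paths and values are placeholders), keeping "lr": "auto" in the DeepSpeed config means the learning rate passed to [TrainingArguments] is the single value that takes effect:

```py
from transformers import TrainingArguments

# ds_config.json is assumed to set "optimizer": {"params": {"lr": "auto", ...}};
# with "auto", the value below is what both the Trainer and DeepSpeed end up using
training_args = TrainingArguments(
    output_dir="output_dir",      # placeholder
    deepspeed="ds_config.json",   # placeholder config path
    learning_rate=1e-4,           # overrides any value hard-coded elsewhere
)
```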
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_108.txt b/chunked/content_aware_chunking/_deepspeed/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..319db7e93ad59888a79a328918c6e650b16e9a12 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_108.txt @@ -0,0 +1 @@ +Aside from the optimizer and scheduler parameters, you'll need to ensure your [Trainer] command line arguments match the DeepSpeed configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_109.txt b/chunked/content_aware_chunking/_deepspeed/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..2802f420df07c241259cb33895fa102a3d06afe1 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_109.txt @@ -0,0 +1 @@ +DeepSpeed offers several optimizers (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_110.txt b/chunked/content_aware_chunking/_deepspeed/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..3220850784a489cc6b029a65dd84b5a7b0c0cb86 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_110.txt @@ -0,0 +1 @@ +If you don't configure the optimizer in the config, the [Trainer] automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: lr, adam_beta1, adam_beta2, adam_epsilon, weight_decay. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_111.txt b/chunked/content_aware_chunking/_deepspeed/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..086893c14dcbdd58458950634bdbdb1f3d81c9da --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_111.txt @@ -0,0 +1 @@ +You can set the parameters to "auto" or manually input your own desired values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_112.txt b/chunked/content_aware_chunking/_deepspeed/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bba430de8144803f61de298d41c25bee1894193 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_112.txt @@ -0,0 +1,13 @@ +yaml +{ + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + } +} +You can also use an unsupported optimizer by adding the following to the top-level configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_113.txt b/chunked/content_aware_chunking/_deepspeed/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f9d53668a07567279e2d1c68014251857d5cdbd --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_113.txt @@ -0,0 +1,5 @@ +yaml +{ + "zero_allow_untested_optimizer": true +} +From DeepSpeed==0.8.3 on, if you want to use offload, you'll also need to add the following to the top-level configuration because offload works best with DeepSpeed's CPU Adam optimizer.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_114.txt b/chunked/content_aware_chunking/_deepspeed/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..98947116029f97f100beec3e121c75ec23bc3ae1 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_114.txt @@ -0,0 +1,6 @@ +yaml +{ + "zero_force_ds_cpu_optimizer": false +} + +DeepSpeed supports the LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR learning rate schedulers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_115.txt b/chunked/content_aware_chunking/_deepspeed/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..055f9385d5f59c2c228936bf86f1082d2642be7b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_115.txt @@ -0,0 +1,6 @@ +Transformers and DeepSpeed provide two of the same schedulers: + +WarmupLR is the same as --lr_scheduler_type constant_with_warmup in Transformers +WarmupDecayLR is the same as --lr_scheduler_type linear in Transformers (this is the default scheduler used in Transformers) + +If you don't configure the scheduler in the config, the [Trainer] automatically selects WarmupDecayLR and either uses the supplied values or the default values for the following parameters from the command line: warmup_min_lr, warmup_max_lr, warmup_num_steps, total_num_steps (automatically calculated during run time if max_steps is not provided). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_116.txt b/chunked/content_aware_chunking/_deepspeed/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..086893c14dcbdd58458950634bdbdb1f3d81c9da --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_116.txt @@ -0,0 +1 @@ +You can set the parameters to "auto" or manually input your own desired values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_117.txt b/chunked/content_aware_chunking/_deepspeed/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bc08acf6880006550a6aaf18824d7db657bb23a --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_117.txt @@ -0,0 +1,15 @@ +yaml +{ + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } +} + +Precision +Deepspeed supports fp32, fp16, and bf16 mixed precision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_118.txt b/chunked/content_aware_chunking/_deepspeed/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..7afc3facf6187ba6340c9954c24900c72cb8240c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_118.txt @@ -0,0 +1 @@ +If your model doesn't work well with mixed precision, for example if it wasn't pretrained in mixed precision, you may encounter overflow or underflow issues which can cause NaN loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_119.txt b/chunked/content_aware_chunking/_deepspeed/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b939bf5f5e05dd4f5dd97a31753a8a9ef5d40ec --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_119.txt @@ -0,0 +1 @@ +For these cases, you should use full fp32 precision by explicitly disabling the default fp16 mode. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_120.txt b/chunked/content_aware_chunking/_deepspeed/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a10ca23dabf926a21deacdcc1cd009707c97708 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_120.txt @@ -0,0 +1,7 @@ +yaml +{ + "fp16": { + "enabled": false + } +} +For Ampere GPUs and PyTorch > 1.7, PyTorch automatically switches to the more efficient tf32 format for some operations, but the results are still in fp32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_121.txt b/chunked/content_aware_chunking/_deepspeed/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..77dc85a35b2e99816cd7099e1018af4268ed4354 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_121.txt @@ -0,0 +1 @@ +You can control it from the [Trainer] by setting --tf32 to enable it, and --tf32 0 or --no_tf32 to disable it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_122.txt b/chunked/content_aware_chunking/_deepspeed/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..845b53006f266209b8d3b1fd1c46239ea7297474 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_122.txt @@ -0,0 +1 @@ +Configuring PyTorch AMP-like fp16 mixed precision reduces memory usage and accelerates training speed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_123.txt b/chunked/content_aware_chunking/_deepspeed/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..d41b8c5eb903623dc195578591e1fd6264f4a52e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_123.txt @@ -0,0 +1 @@ +[Trainer] automatically enables or disables fp16 based on the value of args.fp16_backend, and you can set the rest of the config yourself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_124.txt b/chunked/content_aware_chunking/_deepspeed/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..21e9a1b99989a49511d1f8df38cd2c76f77c06fa --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_124.txt @@ -0,0 +1 @@ +fp16 is enabled from the command line when the following arguments are passed: --fp16, --fp16_backend amp or --fp16_full_eval. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_125.txt b/chunked/content_aware_chunking/_deepspeed/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..bde5d8ef409331cf4a7315c2ef7645227d584636 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_125.txt @@ -0,0 +1,12 @@ +yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +For additional DeepSpeed fp16 training options, take a look at the FP16 Training Options reference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_126.txt b/chunked/content_aware_chunking/_deepspeed/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..835f4eb830f43f056662ac8f064891e79cec1e74 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_126.txt @@ -0,0 +1 @@ +To configure Apex-like fp16 mixed precision, set up the config as shown below with "auto" or your own values.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_127.txt b/chunked/content_aware_chunking/_deepspeed/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d10bec138666df2623f13e3889e2df91b02f238 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_127.txt @@ -0,0 +1 @@ +[Trainer] automatically configure amp based on the values of args.fp16_backend and args.fp16_opt_level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_128.txt b/chunked/content_aware_chunking/_deepspeed/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..216558275295fc0723fca0c1358539eb13129428 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_128.txt @@ -0,0 +1,10 @@ +It can also be enabled from the command line when the following arguments are passed: --fp16, --fp16_backend apex or --fp16_opt_level 01. +yaml +{ + "amp": { + "enabled": "auto", + "opt_level": "auto" + } +} + +To use bf16, you'll need at least DeepSpeed==0.6.0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_129.txt b/chunked/content_aware_chunking/_deepspeed/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc26f17c533469df2d367f0ed390e1f72f0eb2f5 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_129.txt @@ -0,0 +1 @@ +bf16 has the same dynamic range as fp32 and doesn’t require loss scaling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_130.txt b/chunked/content_aware_chunking/_deepspeed/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..dafe0cd2de214997e7a87fffc8b2eeff503d56ae --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_130.txt @@ -0,0 +1 @@ +However, if you use gradient accumulation with bf16, gradients are accumulated in bf16 which may not be desired because this format's low precision can lead to lossy accumulation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_131.txt b/chunked/content_aware_chunking/_deepspeed/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0df203aac9e3316d2a53a68274aa2b8f4c8fabf9 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_131.txt @@ -0,0 +1 @@ +bf16 can be setup in the config file or enabled from the command line when the following arguments are passed: --bf16 or --bf16_full_eval. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_132.txt b/chunked/content_aware_chunking/_deepspeed/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..6853bbe7248af723a86b5d7a1ba47a794373ac74 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_132.txt @@ -0,0 +1,9 @@ +yaml +{ + "bf16": { + "enabled": "auto" + } +} + +Batch size +The batch size can be auto-configured or explicitly set. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_133.txt b/chunked/content_aware_chunking/_deepspeed/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fdba0118a88369f63cdd15a35866c7f954d3ef6 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_133.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets train_micro_batch_size_per_gpu to the value of args.per_device_train_batch_size and train_batch_size to args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_134.txt b/chunked/content_aware_chunking/_deepspeed/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a90a6d659eb0d5a23c195771067ca46bd5c5438 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_134.txt @@ -0,0 +1,7 @@ +yaml +{ + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto" +} +Gradient accumulation +Gradient accumulation can be auto-configured or explicitly set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_135.txt b/chunked/content_aware_chunking/_deepspeed/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..05323339dae0ac7687ac11712bae77b072893c3e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_135.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets it to the value of args.gradient_accumulation_steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_136.txt b/chunked/content_aware_chunking/_deepspeed/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5d9d5bf7269c1c66fcbcb09bd2242f71fee3326 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_136.txt @@ -0,0 +1,7 @@ +```yaml +{ + "gradient_accumulation_steps": "auto" +} + +Gradient clipping +Gradient clipping can be auto-configured or explicitly set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_137.txt b/chunked/content_aware_chunking/_deepspeed/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..945962cf02e7c6890d872fb9be23f39334a1d341 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_137.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets it to the value of args.max_grad_norm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_138.txt b/chunked/content_aware_chunking/_deepspeed/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb2411657ee8dd35c20224e5855c76e2f1972697 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_138.txt @@ -0,0 +1,6 @@ +yaml +{ + "gradient_clipping": "auto" +} +Communication data type +For communication collectives like reduction, gathering and scattering operations, a separate data type is used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_139.txt b/chunked/content_aware_chunking/_deepspeed/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..da405ed0f2934113617e9c4e8c0423e7395d0661 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_139.txt @@ -0,0 +1 @@ +All gather and scatter operations are performed in the same data type the data is in. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_140.txt b/chunked/content_aware_chunking/_deepspeed/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bfed7876df8b4d0fb81cd6e72bfa6d03448c7fa --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_140.txt @@ -0,0 +1 @@ +For example, if you're training with bf16, the data is also gathered in bf16 because gathering is a non-lossy operation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_141.txt b/chunked/content_aware_chunking/_deepspeed/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e74ac9836128c3b96070afcf93d9d0307f26b0e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_141.txt @@ -0,0 +1 @@ +Reduce operations are lossy, for example when gradients are averaged across multiple GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_142.txt b/chunked/content_aware_chunking/_deepspeed/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..497a0514c381b5531bbdc84ea979b45f28ed841b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_142.txt @@ -0,0 +1 @@ +When the communication is done in fp16 or bf16, it is more likely to be lossy because adding multiple numbers in low precision isn't exact. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_143.txt b/chunked/content_aware_chunking/_deepspeed/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..386810deee25b18c57554b9ac286e9ff1f57de74 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_143.txt @@ -0,0 +1 @@ +This is especially the case with bf16 which has a lower precision than fp16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_144.txt b/chunked/content_aware_chunking/_deepspeed/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..262042d4bf4592b55d87ebfbc8f2a66ff7d7eca9 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_144.txt @@ -0,0 +1 @@ +For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_145.txt b/chunked/content_aware_chunking/_deepspeed/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..05c8f8dcb484a677ab0274df3dc24730f432cca3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_145.txt @@ -0,0 +1 @@ +You can choose the communication data type by setting the communication_data_type parameter in the config file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_146.txt b/chunked/content_aware_chunking/_deepspeed/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..55f4470e94c6e8cce5dcdd06cb40ce1049197128 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_146.txt @@ -0,0 +1 @@ +For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you're training in. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_147.txt b/chunked/content_aware_chunking/_deepspeed/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad428106230be7a998fb85802fa6f853a959dc75 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_147.txt @@ -0,0 +1,6 @@ +yaml +{ + "communication_data_type": "fp32" +} +Deployment +DeepSpeed can be deployed by different launchers such as torchrun, the deepspeed launcher, or Accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_148.txt b/chunked/content_aware_chunking/_deepspeed/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f279ac48604cd3782a4dd8c1cb161a2874c25cb --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_148.txt @@ -0,0 +1 @@ +To deploy, add --deepspeed ds_config.json to the [Trainer] command line. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_149.txt b/chunked/content_aware_chunking/_deepspeed/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..402e9c66b46bd4375dfbe3c766d34ae56fb15400 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_149.txt @@ -0,0 +1 @@ +It’s recommended to use DeepSpeed’s add_config_arguments utility to add any necessary command line arguments to your code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_150.txt b/chunked/content_aware_chunking/_deepspeed/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..765871608f0ecfaeb14b2b129e8d3c83485356b7 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_150.txt @@ -0,0 +1 @@ +This guide will show you how to deploy DeepSpeed with the deepspeed launcher for different training setups. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_151.txt b/chunked/content_aware_chunking/_deepspeed/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f33e96ded28f3b527317051fe5b938ce8944f4c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_151.txt @@ -0,0 +1 @@ +You can check out this post for more practical usage examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_152.txt b/chunked/content_aware_chunking/_deepspeed/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..22d848dc8f9aab9c076500f4b3bda7805ec2410e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_152.txt @@ -0,0 +1 @@ +To deploy DeepSpeed on multiple GPUs, add the --num_gpus parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_153.txt b/chunked/content_aware_chunking/_deepspeed/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..d51ba104b7a5278e6ca511b963ca4f4b1b6d6e64 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_153.txt @@ -0,0 +1 @@ +If you want to use all available GPUs, you don't need to add --num_gpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_154.txt b/chunked/content_aware_chunking/_deepspeed/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad6beea89aa03967ae89050edd26f12819c2d733 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_154.txt @@ -0,0 +1 @@ +The example below uses 2 GPUs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_155.txt b/chunked/content_aware_chunking/_deepspeed/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a61f9f715247f588d8a7267c05d3b7c47039f14 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_155.txt @@ -0,0 +1,9 @@ +deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero3.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro + +To deploy DeepSpeed on a single GPU, add the --num_gpus parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_156.txt b/chunked/content_aware_chunking/_deepspeed/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1f13c10d8f8ddebe410171fbe7c828bd880804d --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_156.txt @@ -0,0 +1 @@ +It isn't necessary to explicitly set this value if you only have 1 GPU because DeepSpeed deploys all GPUs it can see on a given node. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_157.txt b/chunked/content_aware_chunking/_deepspeed/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..840d98150b1431bd5deceb4885243fcc0e44695e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_157.txt @@ -0,0 +1,10 @@ +deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero2.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro +DeepSpeed is still useful with just 1 GPU because you can: + +Offload some computations and memory to the CPU to make more GPU resources available to your model to use a larger batch size or fit a very large model that normally won't fit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_158.txt b/chunked/content_aware_chunking/_deepspeed/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c4719a5bff908aa1a02cc2cc4b88f7c5e14c0df --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_158.txt @@ -0,0 +1 @@ +Minimize memory fragmentation with it's smart GPU memory management system which also allows you to fit bigger models and data batches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_159.txt b/chunked/content_aware_chunking/_deepspeed/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..946eb4478b0962236d9759c55c187fe917f7b9be --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_159.txt @@ -0,0 +1 @@ +Set the allgather_bucket_size and reduce_bucket_size values to 2e8 in the ZeRO-2 configuration file to get better performance on a single GPU. 
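For illustration, a minimal sketch of that part of a ZeRO-2 config, written here as a Python dict (everything except the two bucket-size values is an assumed, typical setup):

```py
# hypothetical fragment of a ZeRO-2 DeepSpeed config; the two bucket sizes are the
# values suggested above for better single-GPU performance
ds_config = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},  # assumed offload setup
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8,
    },
}
```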
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_160.txt b/chunked/content_aware_chunking/_deepspeed/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddff58a357cfa4b0fc93e6ca76c5c7d83495354e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_160.txt @@ -0,0 +1,2 @@ +Multi-node deployment +A node is one or more GPUs for running a workload. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_161.txt b/chunked/content_aware_chunking/_deepspeed/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..e14986e787881982d8737ef1f281c8c14f11fe7d --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_161.txt @@ -0,0 +1 @@ +A more powerful setup is a multi-node setup which can be launched with the deepspeed launcher. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_162.txt b/chunked/content_aware_chunking/_deepspeed/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9858e925971e286f831b8aa244c27d1c6f3c6ac --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_162.txt @@ -0,0 +1 @@ +For this guide, let's assume there are two nodes with 8 GPUs each. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_163.txt b/chunked/content_aware_chunking/_deepspeed/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf69dddd1f1ef034fc1b71da136226089491bfb --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_163.txt @@ -0,0 +1 @@ +The first node can be accessed ssh hostname1 and the second node with ssh hostname2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_164.txt b/chunked/content_aware_chunking/_deepspeed/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb6d50eadd90633779fd1e10d7c27265af7e01b3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_164.txt @@ -0,0 +1 @@ +Both nodes must be able to communicate with each other locally over ssh without a password. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_165.txt b/chunked/content_aware_chunking/_deepspeed/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6b0a97b0df0ee0e1066d049743c9ee13a717ebc --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_165.txt @@ -0,0 +1 @@ +By default, DeepSpeed expects your multi-node environment to use a shared storage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_166.txt b/chunked/content_aware_chunking/_deepspeed/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5f7d68fda5d64b42c9e817e4dfb6273228eb77 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_166.txt @@ -0,0 +1,8 @@ +If this is not the case and each node can only see the local filesystem, you need to adjust the config file to include a checkpoint to allow loading without access to a shared filesystem: +yaml +{ + "checkpoint": { + "use_node_local_storage": true + } +} +You could also use the [Trainer]'s --save_on_each_node argument to automatically add the above checkpoint to your config. 
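A minimal sketch of the equivalent in code (the output directory and config path are placeholders); save_on_each_node is the [TrainingArguments] counterpart of that command line flag:

```py
from transformers import TrainingArguments

# when there is no shared filesystem, this makes each node keep its own copy of the
# checkpoint, which is equivalent to adding the "checkpoint" block shown above
training_args = TrainingArguments(
    output_dir="output_dir",       # placeholder
    deepspeed="ds_config.json",    # placeholder config path
    save_on_each_node=True,
)
```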
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_167.txt b/chunked/content_aware_chunking/_deepspeed/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..900d8af2412a2f199228dd6cc998b7ac3b704538 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_167.txt @@ -0,0 +1 @@ +For torchrun, you have to ssh to each node and run the following command on both of them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_168.txt b/chunked/content_aware_chunking/_deepspeed/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..adf2f8e4025ddda62c90abf3e7579c01fc43dffd --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_168.txt @@ -0,0 +1 @@ +The launcher waits until both nodes are synchronized before launching the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_169.txt b/chunked/content_aware_chunking/_deepspeed/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e66677e60dbce0e2384fe52e271f34f1a5c3cbc --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_169.txt @@ -0,0 +1,4 @@ +python -m torch.distributed.run --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=hostname1 \ +--master_port=9901 your_program.py --deepspeed ds_config.json + +For the deepspeed launcher, start by creating a hostfile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_170.txt b/chunked/content_aware_chunking/_deepspeed/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..49d0114626bf1002577d410ad00ea17f4318065a --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_170.txt @@ -0,0 +1,3 @@ +hostname1 slots=8 +hostname2 slots=8 +Then you can launch the training with the following command. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_171.txt b/chunked/content_aware_chunking/_deepspeed/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..f37dbdf4ba397d21adb9ce613e2a659d232849ac --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_171.txt @@ -0,0 +1 @@ +The deepspeed launcher automatically launches the command on both nodes at once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_172.txt b/chunked/content_aware_chunking/_deepspeed/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..347ef6cd37217f7fd43d02adebb2065a257340a1 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_172.txt @@ -0,0 +1,3 @@ +deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \ +your_program.py --deepspeed ds_config.json +Check out the Resource Configuration (multi-node) guide for more details about configuring multi-node compute resources. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_173.txt b/chunked/content_aware_chunking/_deepspeed/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..229089cf5dfa9b25e0b12d818dc2c084fe835230 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_173.txt @@ -0,0 +1,2 @@ +SLURM +In a SLURM environment, you'll need to adapt your SLURM script to your specific environment.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_174.txt b/chunked/content_aware_chunking/_deepspeed/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac72e73fc6000b27498decffd6a37e518191580e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_174.txt @@ -0,0 +1,5 @@ +An example SLURM script may look like: +```bash +SBATCH --job-name=test-nodes # name +SBATCH --nodes=2 # nodes +SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_175.txt b/chunked/content_aware_chunking/_deepspeed/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd5d3987df92e1309049cdabc78c0a6edbe8a636 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_175.txt @@ -0,0 +1,13 @@ +SBATCH --cpus-per-task=10 # number of cores per tasks +SBATCH --gres=gpu:8 # number of gpus +SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +SBATCH --output=%x-%j.out # output file name +export GPUS_PER_NODE=8 +export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_PORT=9901 +srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ + --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ +your_program.py --deepspeed ds_config.json' + +Then you can schedule your multi-node deployment with the following command which launches training simultaneously on all nodes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_176.txt b/chunked/content_aware_chunking/_deepspeed/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..d87d2a53cb65ecd970caa1a41aa156eecaef2e65 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_176.txt @@ -0,0 +1,3 @@ +sbatch launch.slurm +Notebook +The deepspeed launcher doesn't support deployment from a notebook so you'll need to emulate the distributed environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_177.txt b/chunked/content_aware_chunking/_deepspeed/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..68aa56f8bf2a8ce0bef8a96141b873e5f3d9165e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_177.txt @@ -0,0 +1 @@ +However, this only works for 1 GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_178.txt b/chunked/content_aware_chunking/_deepspeed/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb99d480e7d223e40a6707e216c33c653773500e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_178.txt @@ -0,0 +1 @@ +If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_179.txt b/chunked/content_aware_chunking/_deepspeed/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..56b453d4ef8fed1be236376fe80c8f93dabab5c5 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_179.txt @@ -0,0 +1 @@ +This means you have to use the deepspeed launcher which can't be emulated as shown here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_180.txt b/chunked/content_aware_chunking/_deepspeed/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a9c8764b75f7e4d08b92bb76ab3c3c10a584758 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_180.txt @@ -0,0 +1 @@ +DeepSpeed requires a distributed environment even when only one process is used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_181.txt b/chunked/content_aware_chunking/_deepspeed/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..08071098fd3c6ab6445849e342b5584574f6ad93 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_181.txt @@ -0,0 +1,13 @@ +This emulates a launcher in the notebook +import os +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use +os.environ["RANK"] = "0" +os.environ["LOCAL_RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" +Now proceed as normal, plus pass the DeepSpeed config file +training_args = TrainingArguments(, deepspeed="ds_config_zero3.json") +trainer = Trainer() +trainer.train() + +If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated cell. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_182.txt b/chunked/content_aware_chunking/_deepspeed/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..268d7ba1f5fe96510ac97a2fead8df704f3815af --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_182.txt @@ -0,0 +1,62 @@ +%%bash +cat <<'EOT' > ds_config_zero3.json +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, +"optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } +}, + +"scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } +}, + +"zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true +}, + +"gradient_accumulation_steps": "auto", +"gradient_clipping": "auto", +"steps_per_print": 2000, +"train_batch_size": "auto", +"train_micro_batch_size_per_gpu": "auto", +"wall_clock_breakdown": false + +} +EOT + +If the training script is in a file and not in a notebook cell, you can launch deepspeed normally from the shell in a notebook cell. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_183.txt b/chunked/content_aware_chunking/_deepspeed/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..c96aa920e0b27760816f4214fb41460aa233c176 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_183.txt @@ -0,0 +1,5 @@ +For example, to launch run_translation.py: +py +!git clone https://github.com/huggingface/transformers +!cd transformers; deepspeed examples/pytorch/translation/run_translation.py +You could also use %%bash magic and write multi-line code to run the shell program, but you won't be able to view the logs until training is complete. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_184.txt b/chunked/content_aware_chunking/_deepspeed/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ebfd831ec720a1a10066965ef7b7676df4238fd --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_184.txt @@ -0,0 +1 @@ +With %%bash magic, you don't need to emulate a distributed environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_185.txt b/chunked/content_aware_chunking/_deepspeed/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..179f25c3c934d255dc8d1f4525487fdc84fd3118 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_185.txt @@ -0,0 +1,7 @@ +%%bash +git clone https://github.com/huggingface/transformers +cd transformers +deepspeed examples/pytorch/translation/run_translation.py + +Save model weights +DeepSpeed stores the main full precision fp32 weights in custom checkpoint optimizer files (the glob pattern looks like global_step*/*optim_states.pt) and are saved under the normal checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_186.txt b/chunked/content_aware_chunking/_deepspeed/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..6832f83212792d63bdc6323a1085001e95563e6c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_186.txt @@ -0,0 +1 @@ +A model trained with ZeRO-2 saves the pytorch_model.bin weights in fp16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_187.txt b/chunked/content_aware_chunking/_deepspeed/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..76c69d68e5aaac2303bd93cfc801277742b70614 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_187.txt @@ -0,0 +1 @@ +To save the model weights in fp16 for a model trained with ZeRO-3, you need to set "stage3_gather_16bit_weights_on_model_save": true because the model weights are partitioned across multiple GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_188.txt b/chunked/content_aware_chunking/_deepspeed/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d111d84238666c93ed3105fdf7a90a3971b28a3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_188.txt @@ -0,0 +1 @@ +Otherwise, the [Trainer] won't save the weights in fp16 and it won't create a pytorch_model.bin file. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_189.txt b/chunked/content_aware_chunking/_deepspeed/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..68aa899c021ae4e20c4eaa732818578fadcb7334 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_189.txt @@ -0,0 +1 @@ +This is because DeepSpeed's state_dict contains a placeholder instead of the real weights and you won't be able to load them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_190.txt b/chunked/content_aware_chunking/_deepspeed/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2dbe13530438b5072c2e137e97f8c26aacd827 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_190.txt @@ -0,0 +1,8 @@ +yaml +{ + "zero_optimization": { + "stage3_gather_16bit_weights_on_model_save": true + } +} + +The full precision weights shouldn't be saved during training because it can require a lot of memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_191.txt b/chunked/content_aware_chunking/_deepspeed/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d5bd68838b9d7f6a20a216ed829411517858494 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_191.txt @@ -0,0 +1 @@ +It is usually best to save the fp32 weights offline after training is complete. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_192.txt b/chunked/content_aware_chunking/_deepspeed/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..6166e682cb5501ebeb26061e2452607f5a6f4326 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_192.txt @@ -0,0 +1 @@ +But if you have a lot of free CPU memory, it is possible to save the fp32 weights during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_193.txt b/chunked/content_aware_chunking/_deepspeed/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..58d033d0a9dfa83f4df6f11d56d7d48f962f4e3c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_193.txt @@ -0,0 +1 @@ +This section covers both online and offline approaches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_194.txt b/chunked/content_aware_chunking/_deepspeed/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..b274a5d4e0e405c81036dcb414fa05e86cfb351b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_194.txt @@ -0,0 +1,9 @@ +Online +You must have saved at least one checkpoint to load the latest checkpoint as shown in the following: + +from transformers.trainer_utils import get_last_checkpoint +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +checkpoint_dir = get_last_checkpoint(trainer.args.output_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +If you've enabled the --load_best_model_at_end parameter to track the best checkpoint in [TrainingArguments], you can finish training first and save the final model explicitly. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_195.txt b/chunked/content_aware_chunking/_deepspeed/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb16fe0679e3057906ccb59bd5c2652257cd42da --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_195.txt @@ -0,0 +1,8 @@ +Then you can reload it as shown below: + +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final") +trainer.deepspeed.save_checkpoint(checkpoint_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +Once load_state_dict_from_zero_checkpoint is run, the model is no longer usable in DeepSpeed in the context of the same application. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_196.txt b/chunked/content_aware_chunking/_deepspeed/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bc5e1baa81d3f632c1076cc03fcc560f459efe3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_196.txt @@ -0,0 +1 @@ +You'll need to initialize the DeepSpeed engine again since model.load_state_dict(state_dict) removes all the DeepSpeed magic from it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_197.txt b/chunked/content_aware_chunking/_deepspeed/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea449340d5a8c3cb902d3513542da9c0c150cb5d --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_197.txt @@ -0,0 +1 @@ +Only use this at the very end of training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_198.txt b/chunked/content_aware_chunking/_deepspeed/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..08aae38d0504e91b1b51ea6167ca0c7c9661ea6c --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_198.txt @@ -0,0 +1,9 @@ +You can also extract and load the state_dict of the fp32 weights: + +from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint +state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu +model = model.cpu() +model.load_state_dict(state_dict) + +Offline +DeepSpeed provides a zero_to_fp32.py script at the top-level of the checkpoint folder for extracting weights at any point. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_199.txt b/chunked/content_aware_chunking/_deepspeed/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..79dbd9ab748b2a20f0a88ec124fc9e57f24d2a90 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_199.txt @@ -0,0 +1 @@ +This is a standalone script and you don't need a configuration file or [Trainer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_200.txt b/chunked/content_aware_chunking/_deepspeed/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b74d93ec44223fa07a216e69bf4360bd265d0e6 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_200.txt @@ -0,0 +1,16 @@ +For example, if your checkpoint folder looked like this: + +$ ls -l output_dir/checkpoint-1/ +-rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json +drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ +-rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest +-rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt +-rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin +-rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt +-rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json +-rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model +-rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json +-rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json +-rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin +-rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* +To reconstruct the fp32 weights from the DeepSpeed checkpoint (ZeRO-2 or ZeRO-3) subfolder global_step1, run the following command to create and consolidate the full fp32 weights from multiple GPUs into a single pytorch_model.bin file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_201.txt b/chunked/content_aware_chunking/_deepspeed/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f1793c2825ccf78774581ebf737b5010bcb6696 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_201.txt @@ -0,0 +1 @@ +The script automatically discovers the subfolder containing the checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_202.txt b/chunked/content_aware_chunking/_deepspeed/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a1aefd6f31b31150200a6ea7af21a6ae3d88ea --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_202.txt @@ -0,0 +1,2 @@ +py +python zero_to_fp32.py . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_203.txt b/chunked/content_aware_chunking/_deepspeed/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..628c479cb6c763603c97698c0badd8a844871de2 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_203.txt @@ -0,0 +1,3 @@ +pytorch_model.bin + +Run python zero_to_fp32.py -h for more usage details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_204.txt b/chunked/content_aware_chunking/_deepspeed/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab8884b5cdfc7e359b602b7ef645a9531ccd15ed --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_204.txt @@ -0,0 +1 @@ +The script requires 2x the general RAM of the final fp32 weights. 
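Once the consolidated pytorch_model.bin exists, it can be loaded back with plain PyTorch; a minimal sketch, assuming the checkpoint folder layout shown above and a model class that matches the saved config:

```py
import torch
from transformers import AutoConfig, AutoModelForSeq2SeqLM  # placeholder architecture

# rebuild the architecture from the saved config, then load the consolidated fp32 weights
config = AutoConfig.from_pretrained("output_dir/checkpoint-1")  # placeholder path
model = AutoModelForSeq2SeqLM.from_config(config)
state_dict = torch.load("output_dir/checkpoint-1/pytorch_model.bin", map_location="cpu")
model.load_state_dict(state_dict)
```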
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_205.txt b/chunked/content_aware_chunking/_deepspeed/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..d535f3790d8f2b6d962341e6ccf07ff67d457189 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_205.txt @@ -0,0 +1,2 @@ +ZeRO Inference +ZeRO Inference places the model weights in CPU or NVMe memory to avoid burdening the GPU which makes it possible to run inference with huge models on a GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_206.txt b/chunked/content_aware_chunking/_deepspeed/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..33ac83454d197a3f78c8003d52a1fbf766ec87b0 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_206.txt @@ -0,0 +1 @@ +Inference doesn't require any large additional amounts of memory for the optimizer states and gradients so you can fit much larger batches and/or sequence lengths on the same hardware. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_207.txt b/chunked/content_aware_chunking/_deepspeed/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..25464cdad9d2136008db1f015442e3f7d44f54d9 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_207.txt @@ -0,0 +1 @@ +ZeRO Inference shares the same configuration file as ZeRO-3, and ZeRO-2 and ZeRO-1 configs won't work because they don't provide any benefits for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_208.txt b/chunked/content_aware_chunking/_deepspeed/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4e07ac6876fcfec8f8e71fe4b9418a933428de5 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_208.txt @@ -0,0 +1 @@ +To run ZeRO Inference, pass your usual training arguments to the [TrainingArguments] class and add the --do_eval argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_209.txt b/chunked/content_aware_chunking/_deepspeed/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..067bc24d2700ddacb3eb3249c6218693b5f2c391 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_209.txt @@ -0,0 +1,3 @@ +deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json +Non-Trainer DeepSpeed integration +DeepSpeed also works with Transformers without the [Trainer] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_210.txt b/chunked/content_aware_chunking/_deepspeed/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4344d5943b85bd68e894ebf50560c49d8ed93b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_210.txt @@ -0,0 +1 @@ +This is handled by the [HfDeepSpeedConfig] which only takes care of gathering ZeRO-3 parameters and splitting a model across multiple GPUs when you call [~PreTrainedModel.from_pretrained]. 
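Before moving on to the non-[Trainer] path, here is a rough sketch of what the ZeRO Inference invocation above looks like from the Python side. It is only an illustration: the model name, the tiny placeholder eval dataset and the ds_config.json file are stand-ins for your own setup, and the script is still launched with the deepspeed launcher as shown.
python
# Hypothetical evaluation-only script for ZeRO Inference via [Trainer]; launch it with
# `deepspeed --num_gpus=2 zero_inference.py`. Everything named here is a placeholder.
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Trainer, TrainingArguments

model_name = "bigscience/T0_3B"  # example model, not prescribed by the docs
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

class TinyEvalSet(Dataset):
    """A two-example toy eval set; replace with your real tokenized dataset."""
    def __init__(self):
        enc = tokenizer("Is this review positive or negative? Review: great!", return_tensors="pt")
        labels = tokenizer("positive", return_tensors="pt").input_ids
        self.item = {
            "input_ids": enc.input_ids[0],
            "attention_mask": enc.attention_mask[0],
            "labels": labels[0],
        }
    def __len__(self):
        return 2
    def __getitem__(self, idx):
        return self.item

args = TrainingArguments(
    output_dir="out",
    deepspeed="ds_config.json",      # a ZeRO-3 config; ZeRO-1/2 bring no inference benefit
    do_eval=True,
    per_device_eval_batch_size=1,
)

metrics = Trainer(model=model, args=args, eval_dataset=TinyEvalSet()).evaluate()
print(metrics)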
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_211.txt b/chunked/content_aware_chunking/_deepspeed/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1953568614b9156e1868ae1694427e0c3aff558 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_211.txt @@ -0,0 +1 @@ +If you want everything automatically taken care of for you, try using DeepSpeed with the [Trainer]! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_212.txt b/chunked/content_aware_chunking/_deepspeed/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca95de5e9953d06ab3a82e7817ea2b68aad48d1a --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_212.txt @@ -0,0 +1 @@ +You'll need to follow the DeepSpeed documentation, and manually configure the parameter values in the config file (you can't use the "auto" value). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_213.txt b/chunked/content_aware_chunking/_deepspeed/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8fa42490f1f8a13a8301b7f64be521aa62b5e1d --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_213.txt @@ -0,0 +1,12 @@ +To efficiently deploy ZeRO-3, you must instantiate the [HfDeepSpeedConfig] object before the model and keep that object alive: + +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel +import deepspeed +ds_config = {} # deepspeed config object or path to the file +# must run before instantiating the model to detect zero 3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +model = AutoModel.from_pretrained("openai-community/gpt2") +engine = deepspeed.initialize(model=model, config_params=ds_config) + +[HfDeepSpeedConfig] is not required for ZeRO-1 or ZeRO-2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_214.txt b/chunked/content_aware_chunking/_deepspeed/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9550b3d3833cdbb78d8a08152da0f196c3e73f --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_214.txt @@ -0,0 +1,12 @@ +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel, AutoConfig +import deepspeed +ds_config = {} # deepspeed config object or path to the file +# must run before instantiating the model to detect zero 3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +config = AutoConfig.from_pretrained("openai-community/gpt2") +model = AutoModel.from_config(config) +engine = deepspeed.initialize(model=model, config_params=ds_config) + +Non-Trainer ZeRO Inference +To run ZeRO Inference without the [Trainer] in cases where you can’t fit a model onto a single GPU, try using additional GPUs and/or offloading to CPU memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_215.txt b/chunked/content_aware_chunking/_deepspeed/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..5daeda664e767cebd77a46eb3d81734346229741 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_215.txt @@ -0,0 +1 @@ +The important nuance to understand here is that the way ZeRO is designed, you can process different inputs on different GPUs in parallel.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_216.txt b/chunked/content_aware_chunking/_deepspeed/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ccaa2f7c5c20f7a7e8822fca92ce8b96a43e70b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_216.txt @@ -0,0 +1,3 @@ +Make sure to: + +disable CPU offload if you have enough GPU memory (since it slows things down). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_217.txt b/chunked/content_aware_chunking/_deepspeed/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..e458c5a8d0adaaaac0a6515b093dff1ec760cb77 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_217.txt @@ -0,0 +1 @@ +enable bf16 if you have an Ampere or newer GPU to make things faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_218.txt b/chunked/content_aware_chunking/_deepspeed/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e9ae62cb01a0674e55fe39acf74b1d17653b74 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_218.txt @@ -0,0 +1 @@ +If you don’t have one of these GPUs, you may enable fp16 as long as you don’t use a model pretrained in bf16 (T5 models) because it may lead to an overflow error. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_219.txt b/chunked/content_aware_chunking/_deepspeed/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..87257ed0daf24e52ae74ee0b50790f2fd88e5e8b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_219.txt @@ -0,0 +1 @@ +Take a look at the following script to get a better idea of how to run ZeRO Inference without the [Trainer] on a model that won't fit on a single GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_220.txt b/chunked/content_aware_chunking/_deepspeed/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f114b7736c63cecdeeca2ad4435bf73bd2b3f18 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_220.txt @@ -0,0 +1,5 @@ +!/usr/bin/env python +This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model +into a single GPU + +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_221.txt b/chunked/content_aware_chunking/_deepspeed/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce14e06cd0d285fc0b3299af8bbb87e938989bb7 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_221.txt @@ -0,0 +1,2 @@ +Use 1 GPU with CPU offload +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_222.txt b/chunked/content_aware_chunking/_deepspeed/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b8e5830cdd2056c3adfa9f22a4151880395bc7 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_222.txt @@ -0,0 +1,6 @@ +Or use multiple GPUs instead + +First you need to install deepspeed: pip install deepspeed + +Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2 +small GPUs can handle it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_223.txt b/chunked/content_aware_chunking/_deepspeed/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..a61098f98a57f0b041394acfc332fb1b77cd9c5a --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_223.txt @@ -0,0 +1 @@ +or 1 small GPU and a lot of CPU memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_224.txt b/chunked/content_aware_chunking/_deepspeed/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..091d3ea92793255a5020aea7cb37c2a35e49974a --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_224.txt @@ -0,0 +1,2 @@ +To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU - +you will need 2-4 gpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_225.txt b/chunked/content_aware_chunking/_deepspeed/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d3340ded6597ce3b27fe29846346f217e588c7e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_225.txt @@ -0,0 +1,2 @@ +And then you can adapt the script to handle more gpus if you want to +process multiple inputs at once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_226.txt b/chunked/content_aware_chunking/_deepspeed/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..62e29f40a9d915a4df2b81020f84088135687c42 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_226.txt @@ -0,0 +1,3 @@ +The provided deepspeed config also activates CPU memory offloading, so chances are that if you +have a lot of available CPU memory and you don't mind a slowdown you should be able to load a +model that doesn't normally fit into a single GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_227.txt b/chunked/content_aware_chunking/_deepspeed/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ad1b3e6dd68cddef8ae8e43af427118e4ffd88b --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_227.txt @@ -0,0 +1,2 @@ +If you have enough GPU memory the program will +run faster if you don't want offload to CPU - so disable that section then. 
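If you do have enough GPU memory, disabling the offload section is a one-line change to the configuration dictionary. The snippet below is a small sketch mirroring the zero_optimization block of the full config further down; both variants follow the notes in that script (set offload_param.device to "none" or drop the section entirely).
python
# Sketch: turning CPU offload off when the model fits in GPU memory.
# `ds_config` mirrors the zero_optimization section of the full config shown below.
ds_config = {
    "zero_optimization": {
        "stage": 3,
        "offload_param": {"device": "cpu", "pin_memory": True},
    }
}

# Either point the offload device to "none" ...
ds_config["zero_optimization"]["offload_param"]["device"] = "none"
# ... or remove the offload_param section entirely.
ds_config["zero_optimization"].pop("offload_param", None)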
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_228.txt b/chunked/content_aware_chunking/_deepspeed/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..689a2f40ebce3c74f4d4e1004af14697d0cb3a27 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_228.txt @@ -0,0 +1,31 @@ +To deploy on 1 gpu: + +deepspeed --num_gpus 1 t0.py +or: +python -m torch.distributed.run --nproc_per_node=1 t0.py + +To deploy on 2 gpus: + +deepspeed --num_gpus 2 t0.py +or: +python -m torch.distributed.run --nproc_per_node=2 t0.py +from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM +from transformers.integrations import HfDeepSpeedConfig +import deepspeed +import os +import torch +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers +distributed setup +local_rank = int(os.getenv("LOCAL_RANK", "0")) +world_size = int(os.getenv("WORLD_SIZE", "1")) +torch.cuda.set_device(local_rank) +deepspeed.init_distributed() +model_name = "bigscience/T0_3B" +config = AutoConfig.from_pretrained(model_name) +model_hidden_size = config.d_model +batch size has to be divisible by world_size, but can be bigger than world_size +train_batch_size = 1 * world_size +ds_config notes + +- enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be +faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_229.txt b/chunked/content_aware_chunking/_deepspeed/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..02e9d421ac39f4e9da4bf43b2b42616851f71422 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_229.txt @@ -0,0 +1 @@ +- for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_230.txt b/chunked/content_aware_chunking/_deepspeed/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..609dd75d3eee2c71e9ca20ade3640c8c60e01b3f --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_230.txt @@ -0,0 +1,39 @@ +all official t5 models are bf16-pretrained + +- set offload_param.device to "none" or completely remove the offload_param section if you don't +- want CPU offload + +- if using offload_param you can manually finetune stage3_param_persistence_threshold to control +- which params should remain on gpus - the larger the value the smaller the offload size + +For in-depth info on Deepspeed config see +https://huggingface.co/docs/transformers/main/main_classes/deepspeed +keeping the same format as json for consistency, except it uses lower case for true/false +fmt: off +ds_config = { + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": False + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + "pin_memory": True + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} +fmt: on +next line instructs transformers to partition the model directly over multiple gpus using +deepspeed.zero.Init when model's from_pretrained method is called. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_231.txt b/chunked/content_aware_chunking/_deepspeed/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..98c3c805aab06361fff63ccb49b2df17c7d8a1a1 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_231.txt @@ -0,0 +1,6 @@ +it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name) + +otherwise the model will first be loaded normally and only partitioned at forward time which is +less efficient and when there is little CPU RAM may fail +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +now a model can be loaded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_232.txt b/chunked/content_aware_chunking/_deepspeed/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..240e1ddbd999fe11625baf6f7e2c18f9d199f923 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_232.txt @@ -0,0 +1,5 @@ +model = AutoModelForSeq2SeqLM.from_pretrained(model_name) +initialise Deepspeed ZeRO and store only the engine object +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() # inference +Deepspeed ZeRO can process unrelated inputs on each GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_233.txt b/chunked/content_aware_chunking/_deepspeed/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1b152a36e3b2380d266cd6a9c1f90f3b454055f --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_233.txt @@ -0,0 +1 @@ +So for 2 gpus you process 2 inputs at once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_234.txt b/chunked/content_aware_chunking/_deepspeed/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..4180c266623e4e59b59fe3d151f4e2717fa31e9e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_234.txt @@ -0,0 +1 @@ +If you use more GPUs adjust for more. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_235.txt b/chunked/content_aware_chunking/_deepspeed/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3486db1076ee897175669e78a11fdca4e2a0de3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_235.txt @@ -0,0 +1,5 @@ +And of course if you have just one input to process you then need to pass the same string to both gpus +If you use only one GPU, then you will have only rank 0. +rank = torch.distributed.get_rank() +if rank == 0: + text_in = "Is this review positive or negative? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_236.txt b/chunked/content_aware_chunking/_deepspeed/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a626c583ae024dfaa0a3553df78b9fab8fc354f --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_236.txt @@ -0,0 +1,3 @@ +Review: this is the best cast iron skillet you will ever buy" +elif rank == 1: + text_in = "Is this review positive or negative? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_237.txt b/chunked/content_aware_chunking/_deepspeed/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..568375e9faad734bbb487a98f7bed318ea5c633f --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_237.txt @@ -0,0 +1,13 @@ +Review: this is the worst restaurant ever" +tokenizer = AutoTokenizer.from_pretrained(model_name) +inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) +with torch.no_grad(): + outputs = ds_engine.module.generate(inputs, synced_gpus=True) +text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(f"rank{rank}:\n in={text_in}\n out={text_out}") + +Save the script as t0.py and launch it: + +$ deepspeed --num_gpus 2 t0.py +rank0: + in=Is this review positive or negative? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_238.txt b/chunked/content_aware_chunking/_deepspeed/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..01c52f85d4fd1386f6a32650127db5ef2273cfa3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_238.txt @@ -0,0 +1,4 @@ +Review: this is the best cast iron skillet you will ever buy + out=Positive +rank1: + in=Is this review positive or negative? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_239.txt b/chunked/content_aware_chunking/_deepspeed/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a41133e3680fb3b988050d112a6cbc306244639 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_239.txt @@ -0,0 +1,3 @@ +Review: this is the worst restaurant ever + out=negative +This is a very basic example and you'll want to adapt it to your use case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_240.txt b/chunked/content_aware_chunking/_deepspeed/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b1675345714f023a80cfc0ab301a549638926ce --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_240.txt @@ -0,0 +1,2 @@ +Generate +Using multiple GPUs with ZeRO-3 for generation requires synchronizing the GPUs by setting synced_gpus=True in the [~GenerationMixin.generate] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_241.txt b/chunked/content_aware_chunking/_deepspeed/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0cae88bf7e2ac25a5703e5fd4999dc3f61f7103 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_241.txt @@ -0,0 +1 @@ +Otherwise, if one GPU is finished generating before another one, the whole system hangs because the remaining GPUs haven't received the weight shard from the GPU that finished first. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_242.txt b/chunked/content_aware_chunking/_deepspeed/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..0423c76b71de20639a5eae195a37eee79c715247 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_242.txt @@ -0,0 +1 @@ +For Transformers>=4.28, if synced_gpus is automatically set to True if multiple GPUs are detected during generation. 
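For completeness, here is a sketch of setting the flag by hand, e.g. on Transformers releases older than 4.28. It continues the t0.py script above, so ds_engine, inputs and tokenizer refer to the objects defined there.
python
# Explicitly synchronizing GPUs during generation with ZeRO-3 (done automatically on
# Transformers >= 4.28 when several GPUs are detected). Continues the t0.py script above.
import torch

with torch.no_grad():
    outputs = ds_engine.module.generate(inputs, max_new_tokens=60, synced_gpus=True)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))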
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_243.txt b/chunked/content_aware_chunking/_deepspeed/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..68b103857d8828509e9db0a67d889ce135f729fc --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_243.txt @@ -0,0 +1,2 @@ +Troubleshoot +When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obviously and you can see DeepSpeed modules in the exception)! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_244.txt b/chunked/content_aware_chunking/_deepspeed/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..31fcff1b02849c6cfc7aad6a7ae78471149d73c7 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_244.txt @@ -0,0 +1 @@ +The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_245.txt b/chunked/content_aware_chunking/_deepspeed/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..d23931ba70e8f3effb6cacc24d49d9f8e3e6bbd1 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_245.txt @@ -0,0 +1 @@ +If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the DeepSpeed repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_246.txt b/chunked/content_aware_chunking/_deepspeed/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d00c6cc429d03180bb28ae73897134b98b02253 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_246.txt @@ -0,0 +1,17 @@ +For issues related to the Transformers integration, please provide the following information: + +the full DeepSpeed config file + +the command line arguments of the [Trainer], or [TrainingArguments] arguments if you're scripting the [Trainer] setup yourself (don't dump the [TrainingArguments] which has dozens of irrelevant entries) + +the outputs of: + +python -c 'import torch; print(f"torch: {torch.__version__}")' +python -c 'import transformers; print(f"transformers: {transformers.__version__}")' +python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")' + +a link to a Google Colab notebook to reproduce the issue + +if impossible, a standard and non-custom dataset we can use and also try to use an existing example to reproduce the issue with + +The following sections provide a guide for resolving two of the most common issues. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_247.txt b/chunked/content_aware_chunking/_deepspeed/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7383f5ab1943f1c61c85f84b2f4ca4123d0eac --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_247.txt @@ -0,0 +1,2 @@ +DeepSpeed process killed at startup +When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than your system has or your process tried to allocate more CPU memory than allowed leading the OS kernel to terminate the process. 
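One way to check up front whether the host can hold the offloaded states at all is DeepSpeed's live memory estimator. The sketch below assumes a recent DeepSpeed release and uses an example model name; the printed table lists per-GPU and per-CPU estimates for the different offload choices.
python
# Sketch: estimate ZeRO-3 memory needs before launching, to spot impossible CPU-offload setups.
from transformers import AutoModel
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live

model = AutoModel.from_pretrained("bigscience/T0_3B")  # example model
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=2, num_nodes=1)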
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_248.txt b/chunked/content_aware_chunking/_deepspeed/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cc740435bdb269d1d0f4a85e9d36c9acc08bac3 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_248.txt @@ -0,0 +1 @@ +In this case, check whether your configuration file has either offload_optimizer, offload_param or both configured to offload to the CPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_249.txt b/chunked/content_aware_chunking/_deepspeed/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b08b2ddd4c5408711b49d237a88d839066c1cb0 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_249.txt @@ -0,0 +1 @@ +If you have NVMe and ZeRO-3 setup, experiment with offloading to the NVMe (estimate the memory requirements for your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_250.txt b/chunked/content_aware_chunking/_deepspeed/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7ec49bddf99aa1bb87785effd7d59cfd08f89 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_250.txt @@ -0,0 +1,2 @@ +NaN loss +NaN loss often occurs when a model is pretrained in bf16 and then you try to use it with fp16 (especially relevant for TPU trained models). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_251.txt b/chunked/content_aware_chunking/_deepspeed/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..83034be6800994550a0c8cd75183c107eec76f30 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_251.txt @@ -0,0 +1 @@ +To resolve this, use fp32 or bf16 if your hardware supports it (TPU, Ampere GPUs or newer). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_252.txt b/chunked/content_aware_chunking/_deepspeed/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fde88ce34ffcf99b4462b0bb98ffe3d8ae9ba39 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_252.txt @@ -0,0 +1 @@ +The other issue may be related to using fp16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_253.txt b/chunked/content_aware_chunking/_deepspeed/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..1924b6b02b0ec8da9885b7422ff8f60cb3c0f0a5 --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_253.txt @@ -0,0 +1,13 @@ +For example, if this is your fp16 configuration: +yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +You might see the following OVERFLOW! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_deepspeed/chunk_254.txt b/chunked/content_aware_chunking/_deepspeed/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dad5e808aa3e0256f2de3c5747178fff21bec1e --- /dev/null +++ b/chunked/content_aware_chunking/_deepspeed/chunk_254.txt @@ -0,0 +1,4 @@ +messages in the logs: + +0%| | 0/189 [00:00 len(list2): + results.extend(list1[i+1:]) + else: + results.extend(list2[i+1:]) + return results + +""" +`` +For demonstration purposes, we duplicate the system prompt by ten so that the input length is long enough to observe Flash Attention's memory savings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_134.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a3e2355efd9c619e380e1c9c93210402470e8d6 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_134.txt @@ -0,0 +1,4 @@ +We append the original text prompt"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"` +python +long_prompt = 10 * system_prompt + prompt +We instantiate our model again in bfloat16 precision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_135.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..af7cd2638de7f62e87cc3b70919ff311509b5fb8 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_135.txt @@ -0,0 +1,6 @@ +thon +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder") +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + +Let's now run the model just like before without Flash Attention and measure the peak GPU memory requirement and inference time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_136.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..2202fff91e19699fa7a52382adc3945e6433467c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_136.txt @@ -0,0 +1,5 @@ +thon +import time +start_time = time.time() +result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] +print(f"Generated in {time.time() - start_time} seconds.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_137.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..259623816e5024ace4eb9623b9d71659a6f67d9a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_137.txt @@ -0,0 +1,5 @@ +result + +Output: + +Generated in 10.96854019165039 seconds. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_138.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b3483cfedb34ff832fb04eaac426586a0fca55c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_138.txt @@ -0,0 +1 @@ +Sure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_139.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bf948f8a64dfd3ffee41e031bc2e56e23b8947 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_139.txt @@ -0,0 +1 @@ +Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_140.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..59721de1d6bfb62ed7c64461114a0530c9e10d84 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_140.txt @@ -0,0 +1,3 @@ +Here is a function that does that.\n\ndef +` +We're getting the same output as before, however this time, the model repeats the answer multiple times until it's 60 tokens cut-off. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_141.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..6da491543b1603e5091bb1306f83f26474fa1f2b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_141.txt @@ -0,0 +1 @@ +This is not surprising as we've repeated the system prompt ten times for demonstration purposes and thus cued the model to repeat itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_142.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..24b33f735d8bd88247992d58b784b57152cba805 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_142.txt @@ -0,0 +1 @@ +Note that the system prompt should not be repeated ten times in real-world applications - one time is enough! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_143.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9e36c7ac384d4730aadc26fdea5bfdc0c14d3cd --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_143.txt @@ -0,0 +1 @@ +Let's measure the peak GPU memory requirement. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_144.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..d218862b801051deaf101fb22060db4b4ca5fa81 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_144.txt @@ -0,0 +1,6 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: + +37.668193340301514 +As we can see the peak GPU memory requirement is now significantly higher than in the beginning, which is largely due to the longer input sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_145.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1929bad140447f7c24786a88855c546d41cdb02 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_145.txt @@ -0,0 +1 @@ +Also the generation takes a little over a minute now. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_146.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e86a8725a8fc42719684a88e26f41fed684d045 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_146.txt @@ -0,0 +1 @@ +We call flush() to free GPU memory for our next experiment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_147.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..1636a3f931925cd77aade985f59294d70852542f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_147.txt @@ -0,0 +1,3 @@ +python +flush() +For comparison, let's run the same function, but enable Flash Attention instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_148.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e6a91da062935a013898180a9045f80257382bd --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_148.txt @@ -0,0 +1 @@ +To do so, we convert the model to BetterTransformer and by doing so enabling PyTorch's SDPA self-attention which in turn is able to use Flash Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_149.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..30940939265463e3a66b964de537b9c89d727efe --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_149.txt @@ -0,0 +1,3 @@ +python +model.to_bettertransformer() +Now we run the exact same code snippet as before and under the hood Transformers will make use of Flash Attention. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_150.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4cf5819b32b6fc08f67212a57c5e87bda4bc6b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_150.txt @@ -0,0 +1,4 @@ +start_time = time.time() +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] +print(f"Generated in {time.time() - start_time} seconds.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_151.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b60610c9db994d6276f55e41ed19701a690a06f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_151.txt @@ -0,0 +1,4 @@ +result + +Output: +Generated in 3.0211617946624756 seconds. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_152.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b3483cfedb34ff832fb04eaac426586a0fca55c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_152.txt @@ -0,0 +1 @@ +Sure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_153.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bf948f8a64dfd3ffee41e031bc2e56e23b8947 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_153.txt @@ -0,0 +1 @@ +Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_154.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d496a401fa6bb8a9786a14966be5fd34c098262 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_154.txt @@ -0,0 +1,2 @@ +Here is a function that does that.\n\ndef +We're getting the exact same result as before, but can observe a very significant speed-up thanks to Flash Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_155.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..9336b4eeca7ee514f134a3f2e0edf3024eff3caa --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_155.txt @@ -0,0 +1 @@ +Let's measure the memory consumption one last time. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_156.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..c019cd8b6a77900fedbe6bb04443a900f5b7c02a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_156.txt @@ -0,0 +1,5 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: +32.617331981658936 +And we're almost back to our original 29GB peak GPU memory from the beginning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_157.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ef694b64246d7b0128b04dcb18703508d889ea5 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_157.txt @@ -0,0 +1 @@ +We can observe that we only use roughly 100MB more GPU memory when passing a very long input sequence with Flash Attention compared to passing a short input sequence as done in the beginning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_158.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..89af3963135a77d7d31cbd884e70d0e08acd5dd4 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_158.txt @@ -0,0 +1,3 @@ +py +flush() +For more information on how to use Flash Attention, please have a look at this doc page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_159.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_159.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_160.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..114f8b7e3cb2d764a819775edac1a8bd35747211 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_160.txt @@ -0,0 +1,7 @@ +Architectural Innovations +So far we have looked into improving computational and memory efficiency by: + +Casting the weights to a lower precision format +Replacing the self-attention algorithm with a more memory- and compute efficient version + +Let's now look into how we can change the architecture of an LLM so that it is most effective and efficient for task that require long text inputs, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_161.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..3127753795c7b3e86ff6e20173073d0c3393f61b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_161.txt @@ -0,0 +1,5 @@ +: +- Retrieval augmented Questions Answering, +- Summarization, +- Chat +Note that chat not only requires the LLM to handle long text inputs, but it also necessitates that the LLM is able to efficiently handle the back-and-forth dialogue between user and assistant (such as ChatGPT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_162.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..06e78aec9a87987ddc29c4fcca3425cf8cde8e03 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_162.txt @@ -0,0 +1 @@ +Once trained, the fundamental LLM architecture is difficult to change, so it is important to make considerations about the LLM's tasks beforehand and accordingly optimize the model's architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_163.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ac6a36cecd35299fc2d58eba1d6912649593b94 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_163.txt @@ -0,0 +1 @@ +There are two important components of the model architecture that quickly become memory and/or performance bottlenecks for large input sequences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_164.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..17f665281e4b6f37c21d43ba34b75a74c9445546 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_164.txt @@ -0,0 +1,6 @@ +The positional embeddings +The key-value cache + +Let's go over each component in more detail +3.1 Improving positional embeddings of LLMs +Self-attention puts each token in relation to each other's tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_165.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab395f0330c568fb424068c46e0ee270449facd --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_165.txt @@ -0,0 +1,3 @@ +As an example, the \( \text{Softmax}(\mathbf{QK}^T) \) matrix of the text input sequence "Hello", "I", "love", "you" could look as follows: + +Each word token is given a probability mass at which it attends all other word tokens and, therefore is put into relation with all other word tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_166.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_166.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_167.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba52e7e310a0cd90fe88a151bf111e203ef1244f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_167.txt @@ -0,0 +1 @@ +the word "love" attends to the word "Hello" with 5%, to "I" with 30%, and to itself with 65%. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_168.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5974c1f24fc4be516cfe0d9c1c2348555ca966e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_168.txt @@ -0,0 +1 @@ +A LLM based on self-attention, but without position embeddings would have great difficulties in understanding the positions of the text inputs to each other. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_169.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..92f7fb3d7f34e3d49a635dbb793f3a9118b9f678 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_169.txt @@ -0,0 +1 @@ +This is because the probability score computed by \( \mathbf{QK}^T \) relates each word token to each other word token in \( O(1) \) computations regardless of their relative positional distance to each other. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_170.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..42b1615c4bc4554d4dc406b989b1702a8c0a22c0 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_170.txt @@ -0,0 +1 @@ +Therefore, for the LLM without position embeddings each token appears to have the same distance to all other tokens, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_171.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..c904a1b2ca85ada87b847cd0bf8eed7146280f52 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_171.txt @@ -0,0 +1 @@ +differentiating between "Hello I love you" and "You love I hello" would be very challenging. 
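To make this concrete, here is a tiny, self-contained illustration with random weights and no positional information: the self-attention output of a reordered sequence is just the reordered output of the original sequence, so token order by itself carries no signal.
python
# Tiny numerical illustration: without positional embeddings, self-attention is permutation
# equivariant, so a reordered sentence just yields the same outputs in reordered form.
import torch

torch.manual_seed(0)
N, d = 4, 8                     # 4 tokens ("Hello", "I", "love", "you"), hidden size 8
X = torch.randn(N, d)           # token embeddings without any positional information
Wq, Wk, Wv = (torch.randn(d, d) for _ in range(3))

def attention(x):
    q, k, v = x @ Wq, x @ Wk, x @ Wv
    scores = torch.softmax(q @ k.T / d**0.5, dim=-1)   # Softmax(QK^T / sqrt(d))
    return scores @ v

perm = torch.tensor([3, 2, 1, 0])                      # "you love I Hello"
out = attention(X)
out_perm = attention(X[perm])

print(torch.allclose(out[perm], out_perm, atol=1e-6))  # True: only the order changed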
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_172.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..3710d7cb225a2a5c7d4787329e425c9c04a844cb --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_172.txt @@ -0,0 +1 @@ +For the LLM to understand sentence order, an additional cue is needed and is usually applied in the form of positional encodings (or also called positional embeddings). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_173.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f89a194319b3f7118f451bdca75a2d295f4e020 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_173.txt @@ -0,0 +1 @@ +Positional encodings, encode the position of each token into a numerical presentation that the LLM can leverage to better understand sentence order. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_174.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..b37ca0a9d65a5ef94bcd09976d6f57f8094bf586 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_174.txt @@ -0,0 +1 @@ +The authors of the Attention Is All You Need paper introduced sinusoidal positional embeddings \( \mathbf{P} = \mathbf{p}_1, \ldots, \mathbf{p}_N \) . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_175.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c89ff7871280bce829fcdfb25f46002d3da510 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_175.txt @@ -0,0 +1 @@ +where each vector \( \mathbf{p}_i \) is computed as a sinusoidal function of its position \( i \) . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_176.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7515f45ad5f0212ed8d5e442203020ede94b5e8 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_176.txt @@ -0,0 +1 @@ +The positional encodings are then simply added to the input sequence vectors \( \mathbf{\hat{X}} = \mathbf{\hat{x}}_1, \ldots, \mathbf{\hat{x}}_N \) = \( \mathbf{x}_1 + \mathbf{p}_1, \ldots, \mathbf{x}_N + \mathbf{p}_N \) thereby cueing the model to better learn sentence order. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_177.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec92e3d26d4b4d5b8035daf29c672486b3f2bf3f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_177.txt @@ -0,0 +1 @@ +Instead of using fixed position embeddings, others (such as Devlin et al.) 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_178.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a568337a58d91d973be0295385e5a8c84209930 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_178.txt @@ -0,0 +1,2 @@ +used learned positional encodings for which the positional embeddings +\( \mathbf{P} \) are learned during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_179.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..de224516fe1ad811e288cb97f5b5a1694cff9854 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_179.txt @@ -0,0 +1,3 @@ +Sinusoidal and learned position embeddings used to be the predominant methods to encode sentence order into LLMs, but a couple of problems related to these positional encodings were found: + +Sinusoidal and learned position embeddings are both absolute positional embeddings, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_180.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..2231492487adc8cdf047f6f6b9f68a06b4d4dc12 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_180.txt @@ -0,0 +1 @@ +encoding a unique embedding for each position id: \( 0, \ldots, N \) . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_181.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5de0e3d9b2bc550596c48aedd5b172fb359fa30 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_181.txt @@ -0,0 +1 @@ +As shown by Huang et al. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_182.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..c972bca8da386c26377e425e2412564b14ad2e2b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_182.txt @@ -0,0 +1 @@ +and Su et al., absolute positional embeddings lead to poor LLM performance for long text inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_183.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e48b2fc4f03239e7e42733ec4a6814ba0516035 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_183.txt @@ -0,0 +1 @@ +For long text inputs, it is advantageous if the model learns the relative positional distance input tokens have to each other instead of their absolute position. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_184.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3925efc750e1bcea38c391e75581fa8bb7650df --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_184.txt @@ -0,0 +1 @@ +When using learned position embeddings, the LLM has to be trained on a fixed input length \( N \), which makes it difficult to extrapolate to an input length longer than what it was trained on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_185.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac1bcadbd2c1cce69a45b8b31f33613eaa91f586 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_185.txt @@ -0,0 +1,6 @@ +Recently, relative positional embeddings that can tackle the above-mentioned problems have become more popular, most notably: + +Rotary Position Embedding (RoPE) +ALiBi + +Both RoPE and ALiBi argue that it's best to cue the LLM about sentence order directly in the self-attention algorithm as it's there that word tokens are put into relation with each other. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_186.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..c154b5f982763c6e6530dd2ec4d9ca48e286270f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_186.txt @@ -0,0 +1 @@ +More specifically, sentence order should be cued by modifying the \( \mathbf{QK}^T \) computation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_187.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..0633d979184c3ddf90212afb6ba020fb575cfab8 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_187.txt @@ -0,0 +1 @@ +Without going into too many details, RoPE notes that positional information can be encoded into query-key pairs, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_188.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..6abab8a94df6955e6665a264d4a302e854ad182b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_188.txt @@ -0,0 +1,2 @@ +\( \mathbf{q}_i \) and \( \mathbf{x}_j \) by rotating each vector by an angle \( \theta * i \) and \( \theta * j \) respectively, with \( i, j \) describing each vector's sentence position: +$$ \mathbf{\hat{q}}_i^T \mathbf{\hat{x}}_j = \mathbf{q}_i^T \mathbf{R}_{\theta, i - j} \mathbf{x}_j.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_189.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..81700878221bacbf300acd8960f386680c73e303 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_189.txt @@ -0,0 +1,2 @@ +$$ +\( \mathbf{R}_{\theta, i - j} \) thereby represents a rotation matrix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_190.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..418f693216405d7d4721773e4e13e66e61038f5d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_190.txt @@ -0,0 +1 @@ +\( \theta \) is not learned during training, but instead set to a pre-defined value that depends on the maximum input sequence length during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_191.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..e86a9e82bd53417e817cbea1c1a07254e524fa74 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_191.txt @@ -0,0 +1 @@ +By doing so, the probability score between \( \mathbf{q}_i \) and \( \mathbf{q}_j \) is only affected if \( i \ne j \) and solely depends on the relative distance \( i - j \) regardless of each vector's specific positions \( i \) and \( j \). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_192.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f3bf51d70addb4833fe04a2441f52107c679c4b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_192.txt @@ -0,0 +1,7 @@ +RoPE is used in several of today's most important LLMs, such as: + +Falcon +Llama +PaLM + +As an alternative, ALiBi proposes a much simpler relative position encoding scheme. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_193.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc739447122b66ee0cf74e4a69002d235594c83e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_193.txt @@ -0,0 +1 @@ +The relative distance that input tokens have to each other is added as a negative integer scaled by a pre-defined value m to each query-key entry of the \( \mathbf{QK}^T \) matrix right before the softmax computation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_194.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..00d3aaf79f5a12075dd5f9ced3671aa2c228b179 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_194.txt @@ -0,0 +1 @@ +As shown in the ALiBi paper, this simple relative positional encoding allows the model to retain a high performance even at very long text input sequences.
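To make the ALiBi mechanism described above more concrete, here is a minimal, hypothetical PyTorch sketch (the slope value m and the single-head setup are illustrative assumptions; real ALiBi models use a different, head-specific slope for every attention head):

```python
import torch

def alibi_attention(q, k, m=0.5):
    # q, k: [seq_len, head_dim] for a single attention head
    seq_len, head_dim = q.shape
    scores = q @ k.T / head_dim**0.5                     # usual QK^T / sqrt(d) scores

    # relative distance i - j between every query position i and key position j
    positions = torch.arange(seq_len)
    distance = positions[:, None] - positions[None, :]

    # ALiBi: subtract the distance scaled by the pre-defined slope m
    # from the scores right before the softmax
    scores = scores - m * distance.clamp(min=0)

    # causal mask: a token may not attend to future tokens
    future = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
    scores = scores.masked_fill(future, float("-inf"))
    return torch.softmax(scores, dim=-1)

attn = alibi_attention(torch.randn(6, 64), torch.randn(6, 64))
```

The further a key lies in the past, the larger the subtracted bias, so distant tokens receive a lower attention probability, which matches the intuitions listed below.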
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_195.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e7a8a939590e9645935d74ce34047018a7ea429 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_195.txt @@ -0,0 +1,6 @@ +ALiBi is used in several of today's most important LLMs, such as: + +MPT +BLOOM + +Both RoPE and ALiBi position encodings can extrapolate to input lengths not seen during training, though extrapolation has been shown to work much better out-of-the-box for ALiBi than for RoPE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_196.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..267c61744ac4f6e95dbd032fe15f559744c28aa4 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_196.txt @@ -0,0 +1 @@ +For ALiBi, one simply increases the values of the lower triangular position matrix to match the length of the input sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_197.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..d386646e8331a1529686bd0df07b3c3975a6cd1b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_197.txt @@ -0,0 +1 @@ +For RoPE, keeping the same \( \theta \) that was used during training leads to poor results when passing text inputs much longer than those seen during training, cf. Press et al. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_198.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..1046a71534a784fd10640912e6c4d9992bd7cdc4 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_198.txt @@ -0,0 +1 @@ +However, the community has found a couple of effective tricks that adapt \( \theta \), thereby allowing RoPE position embeddings to work well for extrapolated text input sequences (see here). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_199.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ee5ddecc11648b9280efa55016bcabfd62d4e87 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_199.txt @@ -0,0 +1,4 @@ +Both RoPE and ALiBi are relative positional embeddings that are not learned during training, but instead are based on the following intuitions: + - Positional cues about the text inputs should be given directly to the \( QK^T \) matrix of the self-attention layer + - The LLM should be incentivized to learn a constant relative distance between positional encodings + - The further text input tokens are from each other, the lower their query-key probability.
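To complement the RoPE description above, the following minimal sketch (a hypothetical 2-D toy example; real RoPE splits the head dimension into many 2-D pairs, each rotated with its own frequency derived from \( \theta \)) shows that rotating queries and keys by angles proportional to their positions makes their score depend only on the relative distance \( i - j \):

```python
import math
import torch

def rotate(vec, angle):
    # rotate a 2-D vector by `angle` radians
    c, s = math.cos(angle), math.sin(angle)
    return torch.tensor([[c, -s], [s, c]]) @ vec

theta = 0.1  # pre-defined value, not learned
q, k = torch.randn(2), torch.randn(2)

# the position pairs (2, 5) and (10, 13) share the same relative distance i - j = -3
score_a = rotate(q, theta * 2) @ rotate(k, theta * 5)
score_b = rotate(q, theta * 10) @ rotate(k, theta * 13)
print(torch.allclose(score_a, score_b))  # True: only the relative distance matters
```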
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_200.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccbe79ff270c1f814383c4bf1348566890dc5fc4 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_200.txt @@ -0,0 +1 @@ +Both RoPE and ALiBi lower the query-key probability of tokens far away from each other. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_201.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ac7c46725c2d6b86a77e8139a20aae1fb77efa2 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_201.txt @@ -0,0 +1 @@ +RoPE by decreasing their vector product by increasing the angle between the query-key vectors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_202.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..277a2f31a7be5eee1bbd4a226f181ca64fa86ed1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_202.txt @@ -0,0 +1,3 @@ +ALiBi by adding large negative numbers to the vector product + +In conclusion, LLMs that are intended to be deployed in tasks that require handling large text inputs are better trained with relative positional embeddings, such as RoPE and ALiBi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_203.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bba4e5d7dcf9e01e04bb3fd083e016f2336e283 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_203.txt @@ -0,0 +1 @@ +Also note that even if an LLM with RoPE and ALiBi has been trained only on a fixed length of say \( N_1 = 2048 \) it can still be used in practice with text inputs much larger than \( N_1 \), like \( N_2 = 8192 > N_1 \) by extrapolating the positional embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_204.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a1eb79277e5012544fb6c1b49440ac4327a9a2 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_204.txt @@ -0,0 +1,2 @@ +3.2 The key-value cache +Auto-regressive text generation with LLMs works by iteratively putting in an input sequence, sampling the next token, appending the next token to the input sequence, and continuing to do so until the LLM produces a token that signifies that the generation has finished. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_205.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8fcac53602d61832013ce3b67443e065202896 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_205.txt @@ -0,0 +1 @@ +Please have a look at Transformer's Generate Text Tutorial to get a more visual explanation of how auto-regressive generation works. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_206.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c61cf54ef32e3febfc1e46e5069c5ae8c7c179 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_206.txt @@ -0,0 +1 @@ +Let's run a quick code snippet to show how auto-regressive generation works in practice. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_207.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1108e6b683de6a4bc6b79ee6cd2b388dd7c9e6e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_207.txt @@ -0,0 +1 @@ +We will simply take the most likely next token via torch.argmax. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_208.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..28465a6e3a918c6319a9140525ad18b132625488 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_208.txt @@ -0,0 +1,18 @@ +python +input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +for _ in range(5): + next_logits = model(input_ids)["logits"][:, -1:] + next_token_id = torch.argmax(next_logits, dim=-1) + input_ids = torch.cat([input_ids, next_token_id], dim=-1) + print("shape of input_ids", input_ids.shape) +generated_text = tokenizer.batch_decode(input_ids[:, -5:]) +generated_text + +Output: +shape of input_ids torch.Size([1, 21]) +shape of input_ids torch.Size([1, 22]) +shape of input_ids torch.Size([1, 23]) +shape of input_ids torch.Size([1, 24]) +shape of input_ids torch.Size([1, 25]) +[' Here is a Python function'] +As we can see, every iteration extends the text input tokens by the just sampled token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_209.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..608cf94bad607a96cb38861e7c453a56becf6107 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_209.txt @@ -0,0 +1 @@ +With very few exceptions, LLMs are trained using the causal language modeling objective and therefore mask the upper triangle matrix of the attention score - this is why the attention scores for future tokens are left blank (i.e. they have 0 probability). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_210.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..66ebf58286bb953dd6ceee2a8d00c40b98049f3d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_210.txt @@ -0,0 +1 @@ +For a quick recap on causal language modeling, you can refer to the Illustrated Self Attention blog.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_211.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a3d79b9416231eca822145c4d1ed03b438d1bb9 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_211.txt @@ -0,0 +1 @@ +As a consequence, tokens never depend on future tokens; more specifically, the \( \mathbf{q}_i \) vector is never put in relation with any key-value vectors \( \mathbf{k}_j, \mathbf{v}_j \) if \( j > i \). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_212.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f6aacbd84e61cd183e3422a597c0f5e9fe6a41c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_212.txt @@ -0,0 +1 @@ +Instead, \( \mathbf{q}_i \) only attends to previous key-value vectors \( \mathbf{k}_{m < i}, \mathbf{v}_{m < i} \text{, for } m \in \{0, \ldots, i - 1\} \). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_213.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..35279c58e7b2d1eea4f22417eca8ff6354679efb --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_213.txt @@ -0,0 +1 @@ +In order to reduce unnecessary computation, one can therefore cache each layer's key-value vectors for all previous timesteps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_214.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..66245a944e6450415e50a20e5f3a6681b2ea7360 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_214.txt @@ -0,0 +1 @@ +In the following, we will tell the LLM to make use of the key-value cache by retrieving and forwarding it for each forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_215.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c89760069a5f747bebe338d45d2e85cde8cdba1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_215.txt @@ -0,0 +1 @@ +In Transformers, we can retrieve the key-value cache by passing the use_cache flag to the forward call and can then pass it with the current token.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_216.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d49e991528f1e0b0087ef0f42b83849b6d61112 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_216.txt @@ -0,0 +1,27 @@ +python +past_key_values = None # past_key_values is the key-value cache +generated_tokens = [] +next_token_id = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +for _ in range(5): + next_logits, past_key_values = model(next_token_id, past_key_values=past_key_values, use_cache=True).to_tuple() + next_logits = next_logits[:, -1:] + next_token_id = torch.argmax(next_logits, dim=-1) + print("shape of input_ids", next_token_id.shape) + print("length of key-value cache", len(past_key_values[0][0])) # past_key_values are of shape [num_layers, 0 for k, 1 for v, batch_size, length, hidden_dim] + generated_tokens.append(next_token_id.item()) +generated_text = tokenizer.batch_decode(generated_tokens) +generated_text + +Output: +shape of input_ids torch.Size([1, 1]) +length of key-value cache 20 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 21 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 22 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 23 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 24 +[' Here', ' is', ' a', ' Python', ' function'] +As one can see, when using the key-value cache the text input tokens are not increased in length, but remain a single input vector. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_217.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..0006a26bec06b67c1591af7d2da76885027349ab --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_217.txt @@ -0,0 +1 @@ +The length of the key-value cache, on the other hand, is increased by one at every decoding step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_218.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0d802a320c21d5aedceb9fb97f9698f696bfeb --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_218.txt @@ -0,0 +1 @@ +Making use of the key-value cache means that the \( \mathbf{QK}^T \) is essentially reduced to \( \mathbf{q}_c\mathbf{K}^T \) with \( \mathbf{q}_c \) being the query projection of the currently passed input token, which is always just a single vector. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_219.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d5441ce8b16aac7966b380557195197e2dab25d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_219.txt @@ -0,0 +1,2 @@ +Using the key-value cache has two advantages: +- Significant increase in computational efficiency as fewer computations are performed compared to computing the full \( \mathbf{QK}^T \) matrix.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_220.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..4909770c637abb667137faaededa8bd8314d7051 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_220.txt @@ -0,0 +1,2 @@ +This leads to an increase in inference speed +- The maximum required memory is not increased quadratically with the number of generated tokens, but only increases linearly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_221.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..267e2fda0c9cdc8b054d2ccbf6df49865abf1c5c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_221.txt @@ -0,0 +1 @@ +One should always make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_222.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c0f4da467cba6c29bb4302e5583f5622acd4cea --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_222.txt @@ -0,0 +1 @@ +Transformers has the key-value cache enabled by default when making use of the text pipeline or the generate method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_223.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..12656a3f559ab45c4ca14b728493f91d6b73a152 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_223.txt @@ -0,0 +1 @@ +Note that, despite our advice to use key-value caches, your LLM output may be slightly different when you use them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_224.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..1963f6ae114698c64bb44ca641fa0550c875f4f1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_224.txt @@ -0,0 +1 @@ +This is a property of the matrix multiplication kernels themselves -- you can read more about it here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_225.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..92ba9eaa01a152489781b1ed0bdd0fb9660e6531 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_225.txt @@ -0,0 +1,2 @@ +3.2.1 Multi-round conversation +The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_226.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd8c2e67fb79273190177f3c472e857ac1dbfe70 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_226.txt @@ -0,0 +1 @@ +Let's look at an example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_227.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab60f89371d349d37a9bcc43e18ee92237a4b60 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_227.txt @@ -0,0 +1 @@ +User: How many people live in France? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_228.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4b6aaf684df5ff49d2bfe8c4acebf86c53498c1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_228.txt @@ -0,0 +1,2 @@ +Assistant: Roughly 75 million people live in France +User: And how many are in Germany? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_229.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd664c2d626f8337c8fbcda52e3e3aa0c19fca3d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_229.txt @@ -0,0 +1 @@ +Assistant: Germany has ca. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_230.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..fade0102d75c4ae110bee71c9b742047e377c00f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_230.txt @@ -0,0 +1,3 @@ +81 million inhabitants +In this chat, the LLM runs auto-regressive decoding twice: + 1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_231.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..d69c345b1a1ac06cb611d79336a5df429b547ba0 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_231.txt @@ -0,0 +1 @@ +The first time, the key-value cache is empty and the input prompt is "User: How many people live in France?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_232.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..438dd8d431de2dd8a172888c2633e45558057211 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_232.txt @@ -0,0 +1 @@ +and the model auto-regressively generates the text "Roughly 75 million people live in France" while increasing the key-value cache at every decoding step. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_233.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_233.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_234.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..65b967f38653679dc76e8f91c81a0ddaa212c465 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_234.txt @@ -0,0 +1 @@ +The second time the input prompt is "User: How many people live in France? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_235.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9131c65faff2d226989b2827f911a229deda8a6 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_235.txt @@ -0,0 +1 @@ +\n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_236.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7b6ecb26bf4e8fe207ac3da50c8554350e0b70b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_236.txt @@ -0,0 +1 @@ +Thanks to the cache, all key-value vectors for the first two sentences are already computed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_237.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..84184044ecbcf6543b87577240eebc24099be244 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_237.txt @@ -0,0 +1 @@ +Therefore the input prompt only consists of "User: And how many in Germany?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_238.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..7486f49b52fa6ea9342fac877b123c63e3b10cf7 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_238.txt @@ -0,0 +1 @@ +While processing the shortened input prompt, its computed key-value vectors are concatenated to the key-value cache of the first decoding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_239.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..38d28ce0fb24708edfba26e7d8edcb72a0240c3b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_239.txt @@ -0,0 +1 @@ +The second Assistant's answer "Germany has ca.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_240.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..a05d9b0998c391449ebc98c45aab6326c263b386 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_240.txt @@ -0,0 +1 @@ +81 million inhabitants" is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of "User: How many people live in France? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_241.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a204873b1ae96781f970fe2fe8b96aa0836ead7 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_241.txt @@ -0,0 +1 @@ +\n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_242.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..712b36593f4554fa192fea1199794abbff99d7f3 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_242.txt @@ -0,0 +1,2 @@ +Two things should be noted here: + 1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_243.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c89e0e6bfe3625e3450fa124b584e7874eb1aec --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_243.txt @@ -0,0 +1 @@ +Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_244.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_244.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_245.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..25404fc410b8ac8a6de661bb85eb591bf3a44c53 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_245.txt @@ -0,0 +1 @@ +for the example above the LLM needs to understand that the user refers to the population when asking "And how many are in Germany". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_246.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_246.txt @@ -0,0 +1 @@ +2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_247.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..461479c7fc57b18110f8c7b44de4e4c76a955eac --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_247.txt @@ -0,0 +1 @@ +The key-value cache is extremely useful for chat as it allows us to continuously grow the encoded chat history instead of having to re-encode the chat history from scratch (as e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_248.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..b63c3d4626380267bbb62c5433b85f521c7fdcbe --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_248.txt @@ -0,0 +1 @@ +would be the case when using an encoder-decoder architecture). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_249.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a6ad0542e8acce3480e5fe1853a6b4e378aede4 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_249.txt @@ -0,0 +1 @@ +In transformers, a generate call will return past_key_values when return_dict_in_generate=True is passed, in addition to the default use_cache=True. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_250.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..791fd32feda2dd24a60757f7bcc0ebc04f16d232 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_250.txt @@ -0,0 +1 @@ +Note that it is not yet available through the pipeline interface. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_251.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..a26f24633a39942017647be376a9eaf3bccf37b3 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_251.txt @@ -0,0 +1,20 @@ +python +# Generation as usual +prompt = system_prompt + "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here" +model_inputs = tokenizer(prompt, return_tensors='pt') +generation_output = model.generate(**model_inputs, max_new_tokens=60, return_dict_in_generate=True) +decoded_output = tokenizer.batch_decode(generation_output.sequences)[0] +# Piping the returned past_key_values to speed up the next conversation round +prompt = decoded_output + "\nQuestion: How can I modify the function above to return Mega bytes instead?\n\nAnswer: Here" +model_inputs = tokenizer(prompt, return_tensors='pt') +generation_output = model.generate( + **model_inputs, + past_key_values=generation_output.past_key_values, + max_new_tokens=60, + return_dict_in_generate=True +) +tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):] + +Output: + + is a modified version of the function that returns Mega bytes instead.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_252.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e01667498fee602211ef9c05ac5c48b91a43d6 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_252.txt @@ -0,0 +1,5 @@ +def bytes_to_megabytes(bytes): + return bytes / 1024 / 1024 +Answer: The function takes a number of bytes as input and returns the number of + +Great, no additional time is spent recomputing the same key and values for the attention layer! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_253.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..09b83c30c80f299e1a2287c4d13d94726512a38f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_253.txt @@ -0,0 +1 @@ +There is however one catch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_254.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b52d451a0db23331837894f8fc4808cf1c8e9ff --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_254.txt @@ -0,0 +1 @@ +While the required peak memory for the \( \mathbf{QK}^T \) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_255.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_255.txt new file mode 100644 index 0000000000000000000000000000000000000000..4577c35db62ced42f8e75056fd351ba380ddd404 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_255.txt @@ -0,0 +1 @@ +Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \( \mathbf{x}_i \text{, for } i \in {1, \ldots, c - 1} \) for all self-attention layers and for all attention heads. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_256.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_256.txt new file mode 100644 index 0000000000000000000000000000000000000000..6780d54882d67ae87e44dfd549fe9f2931b6451a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_256.txt @@ -0,0 +1 @@ +Let's compute the number of float values that need to be stored in the key-value cache for the LLM bigcode/octocoder that we used before. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_257.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_257.txt new file mode 100644 index 0000000000000000000000000000000000000000..de9ead2d291f6966f6ed0d769d6b97e7e6d44335 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_257.txt @@ -0,0 +1 @@ +The number of float values amounts to two times the sequence length times the number of attention heads times the attention head dimension and times the number of layers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_258.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_258.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9f6240eaedf277aadfa69e0b535ed1227f7a32 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_258.txt @@ -0,0 +1,7 @@ +Computing this for our LLM at a hypothetical input sequence length of 16000 gives: +python +config = model.config +2 * 16_000 * config.n_layer * config.n_head * config.n_embd // config.n_head +Output: +7864320000 +Roughly 8 billion float values! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_259.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_259.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad923555bcdc36adc902d8d69cc071820fa879d5 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_259.txt @@ -0,0 +1 @@ +Storing 8 billion float values in float16 precision requires around 15 GB of RAM, which is circa half as much as the model weights themselves! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_260.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_260.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6e62710b1ef99c785bf2b06475642a129cc370a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_260.txt @@ -0,0 +1 @@ +Researchers have proposed two methods that significantly reduce the memory cost of storing the key-value cache, which are explored in the next subsections. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_261.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_261.txt new file mode 100644 index 0000000000000000000000000000000000000000..c08a28a4528aa9a3049384218e44833103f9861e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_261.txt @@ -0,0 +1,2 @@ +3.2.2 Multi-Query-Attention (MQA) +Multi-Query-Attention was proposed in Noam Shazeer's Fast Transformer Decoding: One Write-Head is All You Need paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_262.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_262.txt new file mode 100644 index 0000000000000000000000000000000000000000..e82a556c7f9ecdb17b72de5f9bab2eb9c8fd717b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_262.txt @@ -0,0 +1 @@ +As the title says, Noam found out that instead of using n_head key-value projection weights, one can use a single head-value projection weight pair that is shared across all attention heads without the model's performance degrading significantly.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_263.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_263.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea57ade8b93978444d6e8c827b7f23663b4c986e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_263.txt @@ -0,0 +1 @@ +By using a single head-value projection weight pair, the key value vectors \( \mathbf{k}_i, \mathbf{v}_i \) have to be identical across all attention heads which in turn means that we only need to store 1 key-value projection pair in the cache instead of n_head ones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_264.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_264.txt new file mode 100644 index 0000000000000000000000000000000000000000..3604e1391b318e694f3a85c25c565cfc5d3328ad --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_264.txt @@ -0,0 +1 @@ +As most LLMs use between 20 and 100 attention heads, MQA significantly reduces the memory consumption of the key-value cache. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_265.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_265.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ae2be67c8f54b1495037959d4c4b30b4bba0d1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_265.txt @@ -0,0 +1 @@ +For the LLM used in this notebook we could therefore reduce the required memory consumption from 15 GB to less than 400 MB at an input sequence length of 16000. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_266.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_266.txt new file mode 100644 index 0000000000000000000000000000000000000000..32166f76ceb4d86f7f8e7e6eb329a696f01beec3 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_266.txt @@ -0,0 +1 @@ +In addition to memory savings, MQA also leads to improved computational efficiency as explained in the following. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_267.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_267.txt new file mode 100644 index 0000000000000000000000000000000000000000..014b349364a8b5bd88df667d880ca26f85d50fd1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_267.txt @@ -0,0 +1 @@ +In auto-regressive decoding, large key-value vectors need to be reloaded, concatenated with the current key-value vector pair to be then fed into the \( \mathbf{q}_c\mathbf{K}^T \) computation at every step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_268.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_268.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca17e261b9a78b722ff5db86a21b954e30f36e37 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_268.txt @@ -0,0 +1 @@ +For auto-regressive decoding, the required memory bandwidth for the constant reloading can become a serious time bottleneck. 
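As a rough back-of-the-envelope sketch of the memory saving (the configuration numbers below are assumptions chosen to reproduce the roughly 8 billion float values computed earlier at a sequence length of 16000; they are not read from the actual checkpoint here):

```python
# assumed configuration values for the example: n_layer, n_head, head_dim
n_layer, n_head, head_dim, seq_len = 40, 48, 128, 16_000

# 2 (keys and values) * seq_len * n_layer * number_of_kv_heads * head_dim
vanilla_cache = 2 * seq_len * n_layer * n_head * head_dim  # one kv head per attention head
mqa_cache = 2 * seq_len * n_layer * 1 * head_dim           # MQA: a single shared kv head

print(vanilla_cache, "->", vanilla_cache * 2 / 1e9, "GB in float16")  # ~7.9e9 floats, ~15.7 GB
print(mqa_cache, "->", mqa_cache * 2 / 1e9, "GB in float16")          # ~1.6e8 floats, ~0.33 GB
```

With a single key-value head, the cache shrinks by a factor of n_head, which is where the drop from roughly 15 GB to under 400 MB quoted above comes from.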
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_269.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_269.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b644418424aed53d2a183240748354276e7014d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_269.txt @@ -0,0 +1 @@ +By reducing the size of the key-value vectors less memory needs to be accessed, thus reducing the memory bandwidth bottleneck. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_270.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_270.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5d3c894bea7888fcb562812ca0d675970fba6e6 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_270.txt @@ -0,0 +1 @@ +For more detail, please have a look at Noam's paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_271.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_271.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbe7940e8f0f00d8b5bc49c896745546912f5688 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_271.txt @@ -0,0 +1 @@ +The important part to understand here is that reducing the number of key-value attention heads to 1 only makes sense if a key-value cache is used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_272.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_272.txt new file mode 100644 index 0000000000000000000000000000000000000000..12b1e76a8781d38c33ac202f4cd3cc9a1b9c5b09 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_272.txt @@ -0,0 +1 @@ +The peak memory consumption of the model for a single forward pass without key-value cache stays unchanged as every attention head still has a unique query vector so that each attention head still has a different \( \mathbf{QK}^T \) matrix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_273.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_273.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b4188f34c9a85b92d0ca4ae0698b3b68e2d2878 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_273.txt @@ -0,0 +1,8 @@ +MQA has seen wide adoption by the community and is now used by many of the most popular LLMs: + +Falcon +PaLM +MPT +BLOOM + +Also, the checkpoint used in this notebook - bigcode/octocoder - makes use of MQA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_274.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_274.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9163d7f72ee223157db7d4a4c85ffb8835c94c6 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_274.txt @@ -0,0 +1,2 @@ +3.2.3 Grouped-Query-Attention (GQA) +Grouped-Query-Attention, as proposed by Ainslie et al. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_275.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_275.txt new file mode 100644 index 0000000000000000000000000000000000000000..27314004f663280778c99913f291012be0dd5f8f --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_275.txt @@ -0,0 +1 @@ +from Google, found that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_276.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_276.txt new file mode 100644 index 0000000000000000000000000000000000000000..176da623b9b06783ad0119be0060db6a003b9709 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_276.txt @@ -0,0 +1 @@ +The paper argues that more model performance can be kept by less drastically reducing the number of query head projection weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_277.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_277.txt new file mode 100644 index 0000000000000000000000000000000000000000..4253b0541230b17dbf11b1878c8f43ec3489b332 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_277.txt @@ -0,0 +1 @@ +Instead of using just a single key-value projection weight, n < n_head key-value projection weights should be used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_278.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_278.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a227a7a131e74b66a06c28cc80226326530991 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_278.txt @@ -0,0 +1 @@ +By choosing n to a significantly smaller value than n_head, such as 2,4 or 8 almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus arguably less performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_279.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_279.txt new file mode 100644 index 0000000000000000000000000000000000000000..098789122ffde43de6867c704c88ce3d4f179fbc --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_279.txt @@ -0,0 +1 @@ +Moreover, the authors of GQA found out that existing model checkpoints can be uptrained to have a GQA architecture with as little as 5% of the original pre-training compute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_280.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_280.txt new file mode 100644 index 0000000000000000000000000000000000000000..64464d38f6f2659d289e6027a6293ce21445a7d7 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_280.txt @@ -0,0 +1 @@ +While 5% of the original pre-training compute can still be a massive amount, GQA uptraining allows existing checkpoints to be useful for longer input sequences. 
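To illustrate the grouping idea, here is a minimal, hypothetical sketch (shapes and head counts are made up for the example; production GQA implementations fuse this step into the attention kernel and keep the cache in the reduced n_kv_head layout):

```python
import torch

batch, seq_len, head_dim = 1, 10, 64
n_head, n_kv_head = 32, 8            # n_kv_head < n_head: 8 groups of 4 query heads each

q = torch.randn(batch, n_head, seq_len, head_dim)
k = torch.randn(batch, n_kv_head, seq_len, head_dim)  # only n_kv_head key heads are cached
v = torch.randn(batch, n_kv_head, seq_len, head_dim)  # only n_kv_head value heads are cached

# every key-value head is shared by a group of n_head // n_kv_head query heads
group_size = n_head // n_kv_head
k = k.repeat_interleave(group_size, dim=1)  # -> [batch, n_head, seq_len, head_dim]
v = v.repeat_interleave(group_size, dim=1)

scores = q @ k.transpose(-1, -2) / head_dim**0.5
attn_out = torch.softmax(scores, dim=-1) @ v  # [batch, n_head, seq_len, head_dim]
```

Setting n_kv_head = 1 recovers MQA, while n_kv_head = n_head recovers vanilla multi-head attention.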
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_281.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_281.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c1003430cdd1bc0ae649bb865a59929f3a754cd --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_281.txt @@ -0,0 +1 @@ +GQA was only recently proposed which is why there is less adoption at the time of writing this notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_282.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_282.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbe78cb48af42df08124a764a625ff817c6923b2 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_282.txt @@ -0,0 +1 @@ +The most notable application of GQA is Llama-v2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_283.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_283.txt new file mode 100644 index 0000000000000000000000000000000000000000..32fd9bc17386098dda83b0c8e00710c2e42d10e2 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_283.txt @@ -0,0 +1 @@ +As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_284.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_284.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ce15e9893c3a39ec280d2e4e7ab22e09b4eba8d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_284.txt @@ -0,0 +1,2 @@ +Conclusion +The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_285.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_285.txt new file mode 100644 index 0000000000000000000000000000000000000000..19c14f3a003b2e19852198cff8ef3e4c0fc60cb1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_285.txt @@ -0,0 +1 @@ +As an example, one such promising research direction is speculative decoding where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_286.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_286.txt new file mode 100644 index 0000000000000000000000000000000000000000..698b0b8a89a301321542c87c5da0e25e6f53ca13 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_286.txt @@ -0,0 +1 @@ +Going into more detail is out of the scope of this notebook, but can be read upon in this nice blog post. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_287.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_287.txt new file mode 100644 index 0000000000000000000000000000000000000000..e056dcda231d62b426b7cc26d5a1d858153d582c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_287.txt @@ -0,0 +1 @@ +The reason massive LLMs such as GPT3/4, Llama-2-70b, Claude, PaLM can run so quickly in chat-interfaces such as Hugging Face Chat or ChatGPT is to a big part thanks to the above-mentioned improvements in precision, algorithms, and architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_288.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_288.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6bca2da77cb58efd04256f506333bb63da76d13 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_288.txt @@ -0,0 +1 @@ +Going forward, accelerators such as GPUs, TPUs, etc will only get faster and allow for more memory, but one should nevertheless always make sure to use the best available algorithms and architectures to get the most bang for your buck 🤗 \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_76.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e8a9e0371c1141cc9f5573298e9e6b47bd9365a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_76.txt @@ -0,0 +1 @@ +While we see very little degradation in accuracy for our model here, 4-bit quantization can in practice often lead to different results compared to 8-bit quantization or full bfloat16 inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_77.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc7b3e1f07f3257414bf6bb3cb6166c7869b684 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_77.txt @@ -0,0 +1 @@ +It is up to the user to try it out. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_78.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbf3ef11cd91f667fa53dc43ce18e32140e02fa5 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_78.txt @@ -0,0 +1 @@ +Also note that inference here was again a bit slower compared to 8-bit quantization which is due to the more aggressive quantization method used for 4-bit quantization leading to \( \text{quantize} \) and \( \text{dequantize} \) taking longer during inference. 
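Chunks 76-78 compare 4-bit and 8-bit inference, but the loading code they refer to sits earlier in the tutorial and is not part of this diff. As a reminder, a minimal 4-bit load with bitsandbytes looks roughly like the sketch below; it assumes the `bitsandbytes` package is installed and uses the tutorial's OctoCoder checkpoint as an example.

```python
# Hedged sketch: running a model in 4-bit precision with bitsandbytes.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

model_id = "bigcode/octocoder"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit ...
    bnb_4bit_compute_dtype=torch.bfloat16,  # ... but run matmuls in bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_id, quantization_config=bnb_config, device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

prompt = "Question: Please write a function in Python that returns 42.\n\nAnswer:"
print(pipe(prompt, max_new_tokens=32)[0]["generated_text"])
```

The extra quantize/dequantize steps mentioned in chunk 78 are exactly what this configuration trades for the lower memory footprint.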
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_79.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c966ea03e72973fa76e11b92c3a17613851d422d --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_79.txt @@ -0,0 +1,6 @@ +python +del model +del pipe +python +flush() +Overall, we saw that running OctoCoder in 8-bit precision reduced the required GPU VRAM from 32G GPU VRAM to only 15GB and running the model in 4-bit precision further reduces the required GPU VRAM to just a bit over 9GB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_80.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..a54cfbd6d073b14bc576088b07a46f47fbaded67 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_80.txt @@ -0,0 +1 @@ +4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_81.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..14265392e2cd937c5ea2ef19a2ad46aa89f9e7e0 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_81.txt @@ -0,0 +1 @@ +For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the AutoGPTQ implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_82.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ea57f221dbf074a7a4ceab1ed60171e32c3081e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_82.txt @@ -0,0 +1 @@ +As a conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and in some cases inference time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_83.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..fac3e9ad6745a64f12d8e568288c89942abf493b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_83.txt @@ -0,0 +1 @@ +If GPU memory is not a constraint for your use case, there is often no need to look into quantization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_84.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..69cba49ff6a99a451910c0e113ed0c0f5ea2a845 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_84.txt @@ -0,0 +1 @@ +However many GPUs simply can't run LLMs without quantization methods and in this case, 4-bit and 8-bit quantization schemes are extremely useful tools. 
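Chunk 79 calls a flush() helper that is defined earlier in the tutorial and therefore does not appear in this diff. A reconstruction in the same spirit (drop Python references, run the garbage collector, clear the CUDA cache and peak-memory counters) is sketched below; treat the exact body as an assumption rather than the original code.

```python
# Sketch of the flush() helper referenced in chunk_79.
import gc
import torch

def flush():
    gc.collect()                          # drop unreferenced Python objects
    torch.cuda.empty_cache()              # return cached blocks to the GPU driver
    torch.cuda.reset_peak_memory_stats()  # reset peak-memory counters for the next measurement

# Typical usage between experiments:
# del model
# del pipe
# flush()
```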
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_85.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..80360fffd7d61136cc89806ea19d54f98d1c158a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_85.txt @@ -0,0 +1 @@ +For more in-detail usage information, we strongly recommend taking a look at the Transformers Quantization Docs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_86.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..da3edab659bfe29055aec5347d24598135337a6a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_86.txt @@ -0,0 +1 @@ +Next, let's look into how we can improve computational and memory efficiency by using better algorithms and an improved model architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_87.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_87.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_88.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..d35c99d6b8bfcba5f1be752e762972b0a12649e1 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_88.txt @@ -0,0 +1,2 @@ +Flash Attention +Today's top-performing LLMs share more or less the same fundamental architecture that consists of feed-forward layers, activation layers, layer normalization layers, and most crucially, self-attention layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_89.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..62afb900d6307adb5077c8b16ac45eb57e3beaf3 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_89.txt @@ -0,0 +1 @@ +Self-attention layers are central to Large Language Models (LLMs) in that they enable the model to understand the contextual relationships between input tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_90.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..22d271d3b7055dd4518d306fec935c1c385ed418 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_90.txt @@ -0,0 +1 @@ +However, the peak GPU memory consumption for self-attention layers grows quadratically both in compute and memory complexity with number of input tokens (also called sequence length) that we denote in the following by \( N \) . 
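The Flash Attention section introduced here is put to use later in the tutorial, outside the chunks in this diff. As a hedged pointer only: recent 🤗 Transformers releases let you request a Flash Attention kernel at load time via the `attn_implementation` argument, provided the `flash-attn` package is installed and the GPU supports it. The checkpoint name below is a placeholder.

```python
# Hedged sketch: requesting the Flash Attention 2 kernel at load time
# (requires `pip install flash-attn` and an Ampere-or-newer GPU; API may differ by version).
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",              # placeholder checkpoint
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
)
```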
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_91.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0667b18d3a01dc2659220ebfe0ecbe5eacd752b --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_91.txt @@ -0,0 +1 @@ +While this is not really noticeable for shorter input sequences (of up to 1000 input tokens), it becomes a serious problem for longer input sequences (at around 16000 input tokens). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_92.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..1142e84f3070f76d96646ba6551227dcf1bcec6c --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_92.txt @@ -0,0 +1 @@ +Let's take a closer look. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_93.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..7463537b41d474517b90cc4c6b0223f00f850f60 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_93.txt @@ -0,0 +1,3 @@ +The formula to compute the output \( \mathbf{O} \) of a self-attention layer for an input \( \mathbf{X} \) of length \( N \) is: +$$ \textbf{O} = \text{Attn}(\mathbf{X}) = \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \text{ with } \mathbf{Q} = \mathbf{W}_q \mathbf{X}, \mathbf{V} = \mathbf{W}_v \mathbf{X}, \mathbf{K} = \mathbf{W}_k \mathbf{X} $$ +Here, \( \mathbf{X} = (\mathbf{x}_1, \dots, \mathbf{x}_N) \) is the input sequence to the attention layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_94.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a6f26bbed753eb0e3a282245a8db1d3e45c601 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_94.txt @@ -0,0 +1 @@ +The projections \( \mathbf{Q} \) and \( \mathbf{K} \) will each consist of \( N \) vectors, resulting in the \( \mathbf{QK}^T \) matrix being of size \( N^2 \). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_95.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..e99046cfe476a5ba7872fd0d2dc8cf1daacd079a --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_95.txt @@ -0,0 +1 @@ +LLMs usually have multiple attention heads, thus doing multiple self-attention computations in parallel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_96.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2cc6dac582d700967f906570c860b8f094d4457 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_96.txt @@ -0,0 +1 @@ +Assuming the LLM has 40 attention heads and runs in bfloat16 precision, we can calculate the memory requirement to store the \( \mathbf{QK^T} \) matrices to be \( 40 * 2 * N^2 \) bytes.
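Chunk 96 states the \( 40 \cdot 2 \cdot N^2 \) bytes rule of thumb for storing the \( \mathbf{QK}^T \) matrices of a 40-head model in bfloat16. The short check below (plain arithmetic, not tutorial code) reproduces the scaling quoted in the next chunk.

```python
# Evaluate the 40 * 2 * N**2 bytes estimate for a few sequence lengths.
n_heads, bytes_per_value = 40, 2  # 40 attention heads, bfloat16 = 2 bytes per value

for n in (1_000, 16_000, 100_000):
    gib = n_heads * bytes_per_value * n**2 / 1024**3
    print(f"N={n:>7,}: {gib:,.1f} GiB")
# N=  1,000: 0.1 GiB (tens of MB)
# N= 16,000: 19.1 GiB
# N=100,000: 745.1 GiB (close to 1 TB)
```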
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_97.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..775affa6f43edaad9386bab8592e02f8098472e8 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_97.txt @@ -0,0 +1 @@ +For \( N=1000 \) only around 50 MB of VRAM are needed, however, for \( N=16000 \) we would need 19 GB of VRAM, and for \( N=100,000 \) we would need almost 1TB just to store the \( \mathbf{QK}^T \) matrices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_98.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..6be32d671abe6b401db76b7827eb3a32da11d154 --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_98.txt @@ -0,0 +1 @@ +Long story short, the default self-attention algorithm quickly becomes prohibitively memory-expensive for large input contexts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_99.txt b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..227e12ea4f84227f972152f95bd918df92bb002e --- /dev/null +++ b/chunked/content_aware_chunking/_llm_tutorial_optimization/chunk_99.txt @@ -0,0 +1 @@ +As LLMs improve in text comprehension and generation, they are applied to increasingly complex tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_22.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aaa5fa33f75e0f19f9b86fe123a638c3e5c1400 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_22.txt @@ -0,0 +1,10 @@ +Now we can also quickly check if we get the same result +as with nvidia-smi CLI: + +nvidia-smi +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_23.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d07a1402451ea27902c93ea97650b0e60fc0566e --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_23.txt @@ -0,0 +1,17 @@ +ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. 
| +|===============================+======================+======================| +| 0 Tesla V100-SXM2 On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ + +We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_24.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e78d218bbdb0112c68c88d2d79fe50d27621470 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_24.txt @@ -0,0 +1,2 @@ +So now we can +start training the model and see how the GPU memory consumption changes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_25.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d9a2bd4dd9fcf826b6e5756d4de13396b83aac --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_25.txt @@ -0,0 +1,13 @@ +First, we set up a few standard training +arguments: +py +default_args = { + "output_dir": "tmp", + "evaluation_strategy": "steps", + "num_train_epochs": 1, + "log_level": "error", + "report_to": "none", +} + +If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python + kernel between experiments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_26.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..85ee8392a566722e9bf2cd6a94f0b69ebf811b13 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_26.txt @@ -0,0 +1,13 @@ +Memory utilization at vanilla training +Let's use the [Trainer] and train the model without using any GPU performance optimization techniques and a batch size of 4: + +from transformers import TrainingArguments, Trainer, logging +logging.set_verbosity_error() +training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +trainer = Trainer(model=model, args=training_args, train_dataset=ds) +result = trainer.train() +print_summary(result) + +Time: 57.82 +Samples/second: 8.86 +GPU memory occupied: 14949 MB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_27.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c66f3db675e6dbec05506914a49b1761cb1c2af --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_27.txt @@ -0,0 +1 @@ +We see that already a relatively small batch size almost fills up our GPU's entire memory. 
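The print_summary(result) call in chunk 26 relies on helper functions defined earlier in the document and not included in this diff. A reconstruction along those lines, using pynvml (the library behind nvidia-smi), is sketched below; the function bodies are an assumption, not the verbatim original.

```python
# Sketch of the GPU-memory helpers referenced above (requires `pip install pynvml`).
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)   # first GPU
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024**2} MB.")

def print_summary(result):
    # `result` is the TrainOutput returned by Trainer.train()
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()
```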
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_28.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef985dd10db757b40bcb13f30a8a2b83a1be37db --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_28.txt @@ -0,0 +1,2 @@ +However, a larger batch size +can often result in faster model convergence or better end performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_29.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd9d52d118ba17ae225cb0d6d6e8145773fac58 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_29.txt @@ -0,0 +1,2 @@ +So ideally we want to tune the batch size to our +model's needs and not to the GPU limitations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_30.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c6d1e3d36fb396f497ba73c503c5151602496a9 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_30.txt @@ -0,0 +1 @@ +What's interesting is that we use much more memory than the size of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_31.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9e6fa053d57ddee95f206f772e9f1edf65ea112 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_31.txt @@ -0,0 +1 @@ +To understand a bit better why this is the case let's have a look at a model's operations and memory needs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_32.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fd94ff3b60c962f86a3d0285825e4c491584b0b --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_32.txt @@ -0,0 +1,2 @@ +Anatomy of Model's Operations +Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_33.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..95f5d5ed767c3711ee845f8686f44c94bfa92a53 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_33.txt @@ -0,0 +1,2 @@ +Tensor Contractions +Linear layers and components of Multi-Head Attention all do batched matrix-matrix multiplications. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_34.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a3d43ede079a1061bfe249d88de35b92085333b --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_34.txt @@ -0,0 +1 @@ +These operations are the most compute-intensive part of training a transformer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_35.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..65173c679a12a25470befdc9ebfa9b6d07b5c545 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_35.txt @@ -0,0 +1,2 @@ +Statistical Normalizations +Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more reduction operations, the result of which is then applied via a map. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_36.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bdb06bdf55f244ed624551783897d2b81c9e47c --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_36.txt @@ -0,0 +1,2 @@ +Element-wise Operators +These are the remaining operators: biases, dropout, activations, and residual connections. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_37.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..278d699c48fe3bfa380ecca066198dfc7e8b5f78 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_37.txt @@ -0,0 +1 @@ +These are the least compute-intensive operations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_38.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c82b51499194063d1d88f8540dccfb7abd27426 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_38.txt @@ -0,0 +1 @@ +This knowledge can be helpful to know when analyzing performance bottlenecks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_39.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c16d95ffcbda84f46df7bc9ff0ddf55452b4d20 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_39.txt @@ -0,0 +1,3 @@ +This summary is derived from Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020 +Anatomy of Model's Memory +We've seen that training the model uses much more memory than just putting the model on the GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_40.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..52bf1e30ff6880cd9366f8f61b63f8500966f599 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_40.txt @@ -0,0 +1,2 @@ +This is because there +are many components during training that use GPU memory. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_41.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a3c9b181c347b1602e23a3b4da9deb8b0130081 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_41.txt @@ -0,0 +1,10 @@ +The components on GPU memory are the following: + +model weights +optimizer states +gradients +forward activations saved for gradient computation +temporary buffers +functionality-specific memory + +A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_42.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..af6f511beeb8e39c59d9e667b136710a53ed487f --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_42.txt @@ -0,0 +1,2 @@ +For +inference there are no optimizer states and gradients, so we can subtract those. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_43.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd972b59b2f14d2f84301cd9da08317288fafc86 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_43.txt @@ -0,0 +1,2 @@ +And thus we end up with 6 bytes per +model parameter for mixed precision inference, plus activation memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_44.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..c04a7cd9af2971e47cbb4d7b0aee3f1a69458ff3 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_44.txt @@ -0,0 +1 @@ +Let's look at the details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_45.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..0898f5cb6bcf021e92411ce8392bbb4b21352200 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_45.txt @@ -0,0 +1,18 @@ +Model Weights: + +4 bytes * number of parameters for fp32 training +6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory) + +Optimizer States: + +8 bytes * number of parameters for normal AdamW (maintains 2 states) +2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes +4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state) + +Gradients + +4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32) + +Forward Activations + +size depends on many factors, the key ones being sequence length, hidden size and batch size. 
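The per-parameter byte counts listed above add up to the "18 bytes per model parameter" rule for mixed-precision AdamW training and 6 bytes per parameter for inference. The small script below makes that arithmetic concrete; the 1.3B parameter count is just an example model size, not one used in the document.

```python
# Back-of-the-envelope memory estimate from the per-parameter byte counts above.
n_params = 1_300_000_000              # example: a ~1.3B parameter model

weights_mixed = 6 * n_params          # fp32 master copy + fp16/bf16 working copy
adamw_states  = 8 * n_params          # two fp32 optimizer moments
gradients     = 4 * n_params          # gradients are kept in fp32

train_bytes = weights_mixed + adamw_states + gradients   # 18 bytes per parameter
infer_bytes = weights_mixed                              #  6 bytes per parameter

print(f"training : {train_bytes / 1024**3:.1f} GiB + activations")  # ~21.8 GiB
print(f"inference: {infer_bytes / 1024**3:.1f} GiB + activations")  # ~7.3 GiB
```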
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_46.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba29679ecc202b2861d32b2c8aff765820678e5d --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_46.txt @@ -0,0 +1,2 @@ +There are the input and output that are being passed and returned by the forward and the backward functions and the +forward activations saved for gradient computation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_47.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..32945e8f7e9934bfcf39ecf0978908568e45525c --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_47.txt @@ -0,0 +1,3 @@ +Temporary Memory +Additionally, there are all kinds of temporary variables which get released once the calculation is done, but in the +moment these could require additional memory and could push to OOM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_48.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ade92f48fc14f683ad60411c3c6b0b8ca817b532 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_48.txt @@ -0,0 +1,2 @@ +Therefore, when coding it's crucial to think +strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_49.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3251068f8c7b6e695905f914dac7f6582f73942 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_49.txt @@ -0,0 +1,2 @@ +Functionality-specific memory +Then, your software could have special memory needs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_50.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..60ab0c7dd9a3044e58951a1f94c778ca7068524b --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_50.txt @@ -0,0 +1,2 @@ +For example, when generating text using beam search, the software +needs to maintain multiple copies of inputs and outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_51.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..41ded1e29b02bd5b051a4ac4478db01dcc73af77 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_51.txt @@ -0,0 +1,3 @@ +forward vs backward Execution Speed +For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates +into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_52.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f8d8ac6af6203ac6f86e75f739b51da2843b7c3 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_52.txt @@ -0,0 +1,3 @@ +Activations are usually +bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward +(e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_53.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..54eda3afa5b8986deffb2d4e479a8196c91688c5 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_53.txt @@ -0,0 +1,2 @@ +activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, +and writes once, gradInput). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_54.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4a5f482ce29e4cb61c4ac96f6aebae9a0b9c8d4 --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_54.txt @@ -0,0 +1 @@ +As you can see, there are potentially a few places where we could save GPU memory or speed up operations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_memory_anatomy/chunk_55.txt b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6c24b44e88d5c28c612527cadc39ca1c8abf2dd --- /dev/null +++ b/chunked/content_aware_chunking/_model_memory_anatomy/chunk_55.txt @@ -0,0 +1,3 @@ +Now that you understand what affects GPU utilization and computation speed, refer to +the Methods and tools for efficient training on a single GPU documentation page to learn about +performance optimization techniques. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_18.txt b/chunked/content_aware_chunking/_model_sharing/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..63519003ba04e6b37e0b064e5effa231917e3d78 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_18.txt @@ -0,0 +1 @@ +While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_19.txt b/chunked/content_aware_chunking/_model_sharing/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a08dcbf1731117afd856bbb0ec2666cb6eeb7500 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_19.txt @@ -0,0 +1 @@ +Converting a checkpoint for another framework is easy. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_20.txt b/chunked/content_aware_chunking/_model_sharing/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a65d8a1cb4243a0820f63fdac9e7ff8f506e08 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_20.txt @@ -0,0 +1 @@ +Make sure you have PyTorch and TensorFlow installed (see here for installation instructions), and then find the specific model for your task in the other framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_21.txt b/chunked/content_aware_chunking/_model_sharing/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8de194264fc7cda0942060e975a0a34b90c57eee --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_21.txt @@ -0,0 +1,24 @@ +Specify from_tf=True to convert a checkpoint from TensorFlow to PyTorch: + +pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +pt_model.save_pretrained("path/to/awesome-name-you-picked") + + + +Specify from_pt=True to convert a checkpoint from PyTorch to TensorFlow: + +tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) + +Then you can save your new TensorFlow model with its new checkpoint: + +tf_model.save_pretrained("path/to/awesome-name-you-picked") + +If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: + +flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( + "path/to/awesome-name-you-picked", from_pt=True + ) + +Push a model during training + +Sharing a model to the Hub is as simple as adding an extra parameter or callback. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_22.txt b/chunked/content_aware_chunking/_model_sharing/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..574287bc2e4444b554e8cd1ee39ff7ed35a4f0e8 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_22.txt @@ -0,0 +1 @@ +Remember from the fine-tuning tutorial, the [TrainingArguments] class is where you specify hyperparameters and additional training options. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_23.txt b/chunked/content_aware_chunking/_model_sharing/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa433370587d7bbff1aa7001a30cc8d874f15bf3 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_23.txt @@ -0,0 +1 @@ +One of these training options includes the ability to push a model directly to the Hub.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_24.txt b/chunked/content_aware_chunking/_model_sharing/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..67cd43d6f6be6eb3ac1c8cfaa3acd917d7f611d5 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_24.txt @@ -0,0 +1,15 @@ +Set push_to_hub=True in your [TrainingArguments]: + +training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) + +Pass your training arguments as usual to [Trainer]: + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, + ) + +After you fine-tune your model, call [~transformers.Trainer.push_to_hub] on [Trainer] to push the trained model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_25.txt b/chunked/content_aware_chunking/_model_sharing/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..08587548cfb91a8bd582cecdf01fa693c824cd08 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_25.txt @@ -0,0 +1 @@ +🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_26.txt b/chunked/content_aware_chunking/_model_sharing/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec7cb2e5f9f1420042f3ca5258d7538cef4b03c --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_26.txt @@ -0,0 +1,5 @@ +trainer.push_to_hub() + + + +Share a model to the Hub with [PushToHubCallback]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_27.txt b/chunked/content_aware_chunking/_model_sharing/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..09047c0ad80ee628a802ec2277b1f20fae4e1eda --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_27.txt @@ -0,0 +1,3 @@ +In the [PushToHubCallback] function, add: + +An output directory for your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_28.txt b/chunked/content_aware_chunking/_model_sharing/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c1c06219a4ae8becf8ce2d40ab03a3ad41ce106 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_28.txt @@ -0,0 +1 @@ +A tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_29.txt b/chunked/content_aware_chunking/_model_sharing/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e82facff58572df38d0cec1acb635e3e6a77f0b --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_29.txt @@ -0,0 +1 @@ +The hub_model_id, which is your Hub username and model name.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_30.txt b/chunked/content_aware_chunking/_model_sharing/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e49bd33dba0b9db52cf3ce0eed1b8594a727ccf --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_30.txt @@ -0,0 +1,11 @@ +from transformers import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" + ) + +Add the callback to fit, and 🤗 Transformers will push the trained model to the Hub: + +model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) + +Use the push_to_hub function +You can also call push_to_hub directly on your model to upload it to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_31.txt b/chunked/content_aware_chunking/_model_sharing/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a509e3afc22487844121d4e52643ae23223c9fa0 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_31.txt @@ -0,0 +1,5 @@ +Specify your model name in push_to_hub: + +pt_model.push_to_hub("my-awesome-model") + +This creates a repository under your username with the model name my-awesome-model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_32.txt b/chunked/content_aware_chunking/_model_sharing/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6d27f6dbe8f4197bdffca5c12bc00f4d5cbe45d --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_32.txt @@ -0,0 +1,10 @@ +Users can now load your model with the from_pretrained function: + +from transformers import AutoModel +model = AutoModel.from_pretrained("your_username/my-awesome-model") + +If you belong to an organization and want to push your model under the organization name instead, just add it to the repo_id: + +pt_model.push_to_hub("my-awesome-org/my-awesome-model") + +The push_to_hub function can also be used to add other files to a model repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_33.txt b/chunked/content_aware_chunking/_model_sharing/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..465c30696539c225e95dbf3052e0341426064cef --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_33.txt @@ -0,0 +1,9 @@ +For example, add a tokenizer to a model repository: + +tokenizer.push_to_hub("my-awesome-model") + +Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: + +tf_model.push_to_hub("my-awesome-model") + +Now when you navigate to your Hugging Face profile, you should see your newly created model repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_34.txt b/chunked/content_aware_chunking/_model_sharing/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..b74c834800e36ef3440cfaf1466a97cf1e296db6 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_34.txt @@ -0,0 +1 @@ +Clicking on the Files tab will display all the files you've uploaded to the repository. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_35.txt b/chunked/content_aware_chunking/_model_sharing/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef32986c92e3863f83eea9cbc68cfacef113e497 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_35.txt @@ -0,0 +1 @@ +For more details on how to create and upload files to a repository, refer to the Hub documentation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_36.txt b/chunked/content_aware_chunking/_model_sharing/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..305fdecf1bbe4ea00bdb516c9fac5a4631b1ec73 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_36.txt @@ -0,0 +1,2 @@ +Upload with the web interface +Users who prefer a no-code approach are able to upload a model through the Hub's web interface. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_37.txt b/chunked/content_aware_chunking/_model_sharing/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff6c37af41ed150d92d9542ea5554b1d8a66d918 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_37.txt @@ -0,0 +1,5 @@ +Visit huggingface.co/new to create a new repository: + +From here, add some information about your model: + +Select the owner of the repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_38.txt b/chunked/content_aware_chunking/_model_sharing/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec3a5916735cdb3f94facab924201b5cc3c9f29d --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_38.txt @@ -0,0 +1 @@ +This can be yourself or any of the organizations you belong to. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_39.txt b/chunked/content_aware_chunking/_model_sharing/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..858ef94b3e83fa88e39495c8c9eed1d665b3e8f2 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_39.txt @@ -0,0 +1 @@ +Pick a name for your model, which will also be the repository name. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_40.txt b/chunked/content_aware_chunking/_model_sharing/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..265a15d8d1b143e111df06cc88bfaec7025c7277 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_40.txt @@ -0,0 +1 @@ +Choose whether your model is public or private. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_41.txt b/chunked/content_aware_chunking/_model_sharing/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef817da96208985d573ad389821568eb151f37a8 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_41.txt @@ -0,0 +1 @@ +Specify the license usage for your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_42.txt b/chunked/content_aware_chunking/_model_sharing/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9fa529bdc042d5faf1b4eb621160ece05bc9978 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_42.txt @@ -0,0 +1 @@ +Now click on the Files tab and click on the Add file button to upload a new file to your repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_43.txt b/chunked/content_aware_chunking/_model_sharing/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..566e13a494ba1e016f6e4448adc4b4655df6c29f --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_43.txt @@ -0,0 +1 @@ +Then drag-and-drop a file to upload and add a commit message. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_44.txt b/chunked/content_aware_chunking/_model_sharing/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..89a536ec3567b43034fef9897d0e023d519809ef --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_44.txt @@ -0,0 +1,2 @@ +Add a model card +To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_45.txt b/chunked/content_aware_chunking/_model_sharing/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fb1deb55806e2f4ddce04ebf7b466eead998d04 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_45.txt @@ -0,0 +1 @@ +The model card is defined in the README.md file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_46.txt b/chunked/content_aware_chunking/_model_sharing/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..c64f26811a7cbfe6af6df6c57a00b7d721ff9af7 --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_46.txt @@ -0,0 +1,3 @@ +You can add a model card by: + +Manually creating and uploading a README.md file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_47.txt b/chunked/content_aware_chunking/_model_sharing/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..d33d60388b38bd0bc6e53351c1fbbef5a247950c --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_47.txt @@ -0,0 +1 @@ +Clicking on the Edit model card button in your model repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_48.txt b/chunked/content_aware_chunking/_model_sharing/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..73d606a14f20f7346026655a0de3a294e823de5e --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_48.txt @@ -0,0 +1 @@ +Take a look at the DistilBert model card for a good example of the type of information a model card should include. 
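Besides the manual and web-interface routes described above, the huggingface_hub library also exposes a ModelCard class for creating the README.md programmatically. The snippet below is a hedged sketch only; the repository id and card text are placeholders, and this guide does not prescribe this approach.

```python
# Hedged sketch: creating and pushing a model card with huggingface_hub.
from huggingface_hub import ModelCard

content = """---
license: apache-2.0
---
# my-awesome-model

Describe intended use, limitations, potential biases, and training data here.
"""

card = ModelCard(content)                           # parses the YAML header + Markdown body
card.push_to_hub("your-username/my-awesome-model")  # placeholder repo id
```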
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_sharing/chunk_49.txt b/chunked/content_aware_chunking/_model_sharing/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a527cf8c814642e92ecff1b6c49908300fbc9d8c --- /dev/null +++ b/chunked/content_aware_chunking/_model_sharing/chunk_49.txt @@ -0,0 +1 @@ +For more details about other options you can control in the README.md file such as a model's carbon footprint or widget examples, refer to the documentation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_100.txt b/chunked/content_aware_chunking/_model_summary/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..cced6651a1ecc9da5603bc145dfe1e46d5ab6d5a --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_100.txt @@ -0,0 +1 @@ +Donut is pretrained to read text by predicting the next word based on the image and text annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_101.txt b/chunked/content_aware_chunking/_model_summary/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4a07389550a2b7c19e5dd81c62a23ff1d016626 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_101.txt @@ -0,0 +1 @@ +The decoder generates a token sequence given a prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_102.txt b/chunked/content_aware_chunking/_model_summary/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e9dfbeb582fceb458cf221556eb086c89875254 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_102.txt @@ -0,0 +1 @@ +The prompt is represented by a special token for each downstream task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_103.txt b/chunked/content_aware_chunking/_model_summary/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66cd26345532bc622906b276a08bd1f37be4a01 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_103.txt @@ -0,0 +1 @@ +For example, document parsing has a special parsing token that is combined with the encoder hidden states to parse the document into a structured output format (JSON). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_104.txt b/chunked/content_aware_chunking/_model_summary/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..f44688c7da6145d7a1ce25efbba20a973d5cc700 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_104.txt @@ -0,0 +1,4 @@ +Reinforcement learning + +Decoder[[rl-decoder]] +The Decision and Trajectory Transformer casts the state, action, and reward as a sequence modeling problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_105.txt b/chunked/content_aware_chunking/_model_summary/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..cefcb916b6ce3f04fafc7b254346caa30b2d16dc --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_105.txt @@ -0,0 +1 @@ +The Decision Transformer generates a series of actions that lead to a future desired return based on returns-to-go, past states, and actions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_106.txt b/chunked/content_aware_chunking/_model_summary/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..420a23141bcb5f1a9b88e4451f028baa43882cb4 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_106.txt @@ -0,0 +1 @@ +For the last K timesteps, each of the three modalities are converted into token embeddings and processed by a GPT-like model to predict a future action token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_107.txt b/chunked/content_aware_chunking/_model_summary/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..c594fce5c82553fcca85b46a4fc16c0bf34341bf --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_107.txt @@ -0,0 +1 @@ +Trajectory Transformer also tokenizes the states, actions, and rewards and processes them with a GPT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_108.txt b/chunked/content_aware_chunking/_model_summary/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfda2b0001a4fc782086f1118f924907438bc377 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_108.txt @@ -0,0 +1 @@ +Unlike the Decision Transformer, which is focused on reward conditioning, the Trajectory Transformer generates future actions with beam search. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_34.txt b/chunked/content_aware_chunking/_model_summary/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..312c982b10058f7b79a6e0a94900334f5a4bedfd --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_34.txt @@ -0,0 +1 @@ +DETR has a pretrained backbone, but it also uses the complete Transformer encoder-decoder architecture for object detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_35.txt b/chunked/content_aware_chunking/_model_summary/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a726408f35fe5368f3858d65f767db9e82d520b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_35.txt @@ -0,0 +1 @@ +The encoder learns image representations and combines them with object queries (each object query is a learned embedding that focuses on a region or object in an image) in the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_36.txt b/chunked/content_aware_chunking/_model_summary/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..551f924946f76454a5a975d8302cb17af66d386d --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_36.txt @@ -0,0 +1 @@ +DETR predicts the bounding box coordinates and class label for each object query. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_37.txt b/chunked/content_aware_chunking/_model_summary/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a087dbbe92ffb808c37080556ec441e0c3282a0 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_37.txt @@ -0,0 +1,4 @@ +Natural language processing + +Encoder[[nlp-encoder]] +BERT is an encoder-only Transformer that randomly masks certain tokens in the input to avoid seeing other tokens, which would allow it to "cheat". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_38.txt b/chunked/content_aware_chunking/_model_summary/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..042853f947770dc47c5f02ff554e13be7157dd4b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_38.txt @@ -0,0 +1 @@ +The pretraining objective is to predict the masked token based on the context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_39.txt b/chunked/content_aware_chunking/_model_summary/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..26a0f0955fe8b1f87a1659d6ef087d06129c6941 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_39.txt @@ -0,0 +1 @@ +This allows BERT to fully use the left and right contexts to help it learn a deeper and richer representation of the inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_40.txt b/chunked/content_aware_chunking/_model_summary/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..048ece9e9c736a98c0823d7c4c726acedea94ad2 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_40.txt @@ -0,0 +1 @@ +However, there was still room for improvement in BERT's pretraining strategy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_41.txt b/chunked/content_aware_chunking/_model_summary/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea15c4a27c63b879c029fefe0f85fd7fdba28d77 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_41.txt @@ -0,0 +1 @@ +RoBERTa improved upon this by introducing a new pretraining recipe that includes training for longer and on larger batches, randomly masking tokens at each epoch instead of just once during preprocessing, and removing the next-sentence prediction objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_42.txt b/chunked/content_aware_chunking/_model_summary/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9d1dc15faf2a4f5da81ff584bb6054a0c0e5c81 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_42.txt @@ -0,0 +1 @@ +The dominant strategy to improve performance is to increase the model size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_43.txt b/chunked/content_aware_chunking/_model_summary/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec218fef25a189f84e53c2f00e531d8e5b1719ad --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_43.txt @@ -0,0 +1 @@ +But training large models is computationally expensive. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_44.txt b/chunked/content_aware_chunking/_model_summary/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5aa0876384e28f6d980df29dffa4f0ee8a01eed --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_44.txt @@ -0,0 +1 @@ +One way to reduce computational costs is using a smaller model like DistilBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_45.txt b/chunked/content_aware_chunking/_model_summary/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..12f2515bde2570301fc8c1e8e43ca4ab535eb089 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_45.txt @@ -0,0 +1 @@ +DistilBERT uses knowledge distillation - a compression technique - to create a smaller version of BERT while keeping nearly all of its language understanding capabilities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_46.txt b/chunked/content_aware_chunking/_model_summary/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b23afee9752e61bf1bfdf8ad742aafab6f07c1ee --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_46.txt @@ -0,0 +1 @@ +However, most Transformer models continued to trend towards more parameters, leading to new models focused on improving training efficiency. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_47.txt b/chunked/content_aware_chunking/_model_summary/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fdad700b0cd037f920c4f8826878b97d72c82b4 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_47.txt @@ -0,0 +1 @@ +ALBERT reduces memory consumption by lowering the number of parameters in two ways: separating the larger vocabulary embedding into two smaller matrices and allowing layers to share parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_48.txt b/chunked/content_aware_chunking/_model_summary/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef16c4f38ac6987db4800bf209cc1250a2a9454d --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_48.txt @@ -0,0 +1 @@ +DeBERTa added a disentangled attention mechanism where the word and its position are separately encoded in two vectors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_49.txt b/chunked/content_aware_chunking/_model_summary/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6b755b5e8beec14577bade47181e638a50db881 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_49.txt @@ -0,0 +1 @@ +The attention is computed from these separate vectors instead of a single vector containing the word and position embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_50.txt b/chunked/content_aware_chunking/_model_summary/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc3abf6520898213d15cf36fc5dbeb1e56dcfb01 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_50.txt @@ -0,0 +1 @@ +Longformer also focused on making attention more efficient, especially for processing documents with longer sequence lengths. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_51.txt b/chunked/content_aware_chunking/_model_summary/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba6d4a69d8c7af6234153b22eba5333b2f2c4c7a --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_51.txt @@ -0,0 +1 @@ +It uses a combination of local windowed attention (attention only calculated from fixed window size around each token) and global attention (only for specific task tokens like [CLS] for classification) to create a sparse attention matrix instead of a full attention matrix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_52.txt b/chunked/content_aware_chunking/_model_summary/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..e81af808a01aaa70320bea52f5e0e01e2a02d1ca --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_52.txt @@ -0,0 +1,2 @@ +Decoder[[nlp-decoder]] +GPT-2 is a decoder-only Transformer that predicts the next word in the sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_53.txt b/chunked/content_aware_chunking/_model_summary/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6aebba67b3d8fe9f209190704466221e477deca --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_53.txt @@ -0,0 +1 @@ +It masks tokens to the right so the model can't "cheat" by looking ahead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_54.txt b/chunked/content_aware_chunking/_model_summary/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0b3214e1ad00b9c3de0f772452d23ed838f7f9d --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_54.txt @@ -0,0 +1 @@ +By pretraining on a massive body of text, GPT-2 became really good at generating text, even if the text is only sometimes accurate or true. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_55.txt b/chunked/content_aware_chunking/_model_summary/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c83a758a7e1299f6d6611e49d1af9444df60c3fa --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_55.txt @@ -0,0 +1 @@ +But GPT-2 lacked the bidirectional context from BERT's pretraining, which made it unsuitable for certain tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_56.txt b/chunked/content_aware_chunking/_model_summary/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..427c83f7d2358f4e2bb4503b891e77821b08470b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_56.txt @@ -0,0 +1 @@ +XLNET combines the best of both BERT and GPT-2's pretraining objectives by using a permutation language modeling objective (PLM) that allows it to learn bidirectionally. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_57.txt b/chunked/content_aware_chunking/_model_summary/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7b41fb03ce21ddd96ab1e93f3a6675786bb119 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_57.txt @@ -0,0 +1 @@ +After GPT-2, language models grew even bigger and are now known as large language models (LLMs). 
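Since the chunks above describe GPT-2 as a decoder-only model that simply predicts the next token, a minimal generation sketch may help; it reuses the openai-community/gpt2 checkpoint that also appears in the perf_hardware chunks later in this diff, and the prompt and sampling settings are illustrative only.

from transformers import pipeline

generator = pipeline("text-generation", model="openai-community/gpt2")  # decoder-only next-token prediction
print(generator("The Transformer architecture", max_new_tokens=30, do_sample=True)[0]["generated_text"])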
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_58.txt b/chunked/content_aware_chunking/_model_summary/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..5abe4601f5ac5f793d268fbc0d1443523a76be33 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_58.txt @@ -0,0 +1 @@ +LLMs demonstrate few- or even zero-shot learning if pretrained on a large enough dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_59.txt b/chunked/content_aware_chunking/_model_summary/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a97edcdf3a5a3d6d15412fa0fda392b48adcf5 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_59.txt @@ -0,0 +1 @@ +GPT-J is an LLM with 6B parameters and trained on 400B tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_60.txt b/chunked/content_aware_chunking/_model_summary/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..93c5035d5ce79a3b7a5453d32012306b67d1f27b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_60.txt @@ -0,0 +1 @@ +GPT-J was followed by OPT, a family of decoder-only models, the largest of which is 175B and trained on 180B tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_61.txt b/chunked/content_aware_chunking/_model_summary/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdfe060c0be6d01fe2eb9ec9423cdaba39883d17 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_61.txt @@ -0,0 +1 @@ +BLOOM was released around the same time, and the largest model in the family has 176B parameters and is trained on 366B tokens in 46 languages and 13 programming languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_62.txt b/chunked/content_aware_chunking/_model_summary/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0f1da22d07baf159f3b84cf117493052116b0fd --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_62.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[nlp-encoder-decoder]] +BART keeps the original Transformer architecture, but it modifies the pretraining objective with text infilling corruption, where some text spans are replaced with a single mask token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_63.txt b/chunked/content_aware_chunking/_model_summary/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..b805954fa014a3d1731324b55826761738f2649b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_63.txt @@ -0,0 +1 @@ +The decoder predicts the uncorrupted tokens (future tokens are masked) and uses the encoder's hidden states to help it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_64.txt b/chunked/content_aware_chunking/_model_summary/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..cefd056308d5055de68b8b78a4b117b3d0b649b9 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_64.txt @@ -0,0 +1 @@ +Pegasus is similar to BART, but Pegasus masks entire sentences instead of text spans. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_65.txt b/chunked/content_aware_chunking/_model_summary/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8310caab4c94e102b7616fe6558242bde16d6a9 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_65.txt @@ -0,0 +1 @@ +In addition to masked language modeling, Pegasus is pretrained by gap sentence generation (GSG). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_66.txt b/chunked/content_aware_chunking/_model_summary/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..2df7e5e1a9ff5102b44c95ec4ba4c807e9c24bcc --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_66.txt @@ -0,0 +1 @@ +The GSG objective masks whole sentences important to a document, replacing them with a mask token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_67.txt b/chunked/content_aware_chunking/_model_summary/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..941885d0ac635f3aa3c55fdab34fa353237fa9ba --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_67.txt @@ -0,0 +1 @@ +The decoder must generate the output from the remaining sentences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_68.txt b/chunked/content_aware_chunking/_model_summary/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..54e7dfc1e72547dbf31e497b9aca3e19adcfef2a --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_68.txt @@ -0,0 +1 @@ +T5 is a more unique model that casts all NLP tasks into a text-to-text problem using specific prefixes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_69.txt b/chunked/content_aware_chunking/_model_summary/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a875af3d3b10e4c67948d2771287512764040e0 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_69.txt @@ -0,0 +1 @@ +For example, the prefix Summarize: indicates a summarization task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_70.txt b/chunked/content_aware_chunking/_model_summary/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ed21d4e103efa39d922d8d3ab0424ecb2352c0 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_70.txt @@ -0,0 +1 @@ +T5 is pretrained by supervised (GLUE and SuperGLUE) training and self-supervised training (randomly sample and drop out 15% of tokens). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_71.txt b/chunked/content_aware_chunking/_model_summary/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..c51d5db86cfa0e5b8205aa569868b990f566a6e7 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_71.txt @@ -0,0 +1,4 @@ +Audio + +Encoder[[audio-encoder]] +Wav2Vec2 uses a Transformer encoder to learn speech representations directly from raw audio waveforms. 
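The T5 chunks above describe casting every NLP task as text-to-text via a task prefix; a minimal sketch of that usage follows (the google-t5/t5-small checkpoint and the input sentence are assumptions, not taken from these chunks).

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# assumed checkpoint; the task prefix ("summarize: ", "translate English to German: ", ...) is the key idea
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

inputs = tokenizer("summarize: Transformers provides pretrained models for text, vision, and audio tasks.", return_tensors="pt")
summary_ids = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))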
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_72.txt b/chunked/content_aware_chunking/_model_summary/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..3acce02fb2baaaba20feed66c0242df18f48775b --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_72.txt @@ -0,0 +1 @@ +It is pretrained with a contrastive task to determine the true speech representation from a set of false ones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_73.txt b/chunked/content_aware_chunking/_model_summary/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceb5be73c9770ba50a096ba1cb9fc59a4a4e3eeb --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_73.txt @@ -0,0 +1 @@ +HuBERT is similar to Wav2Vec2 but has a different training process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_74.txt b/chunked/content_aware_chunking/_model_summary/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..81e0fc875be3def47aad40013ff3701d96203c49 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_74.txt @@ -0,0 +1 @@ +Target labels are created by a clustering step in which segments of similar audio are assigned to a cluster which becomes a hidden unit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_75.txt b/chunked/content_aware_chunking/_model_summary/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b18ff4466a4d4eb51036482dbcc1ebc733a2095 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_75.txt @@ -0,0 +1 @@ +The hidden unit is mapped to an embedding to make a prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_76.txt b/chunked/content_aware_chunking/_model_summary/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..509001635e7695ac30c75adde603537b1158ad05 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_76.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[audio-encoder-decoder]] +Speech2Text is a speech model designed for automatic speech recognition (ASR) and speech translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_77.txt b/chunked/content_aware_chunking/_model_summary/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..014b31cf30b5d3f1f3fdd122f17920d681cf8fcc --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_77.txt @@ -0,0 +1 @@ +The model accepts log mel-filter bank features extracted from the audio waveform and pretrained autoregressively to generate a transcript or translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_78.txt b/chunked/content_aware_chunking/_model_summary/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf523a00f70e3a4fabc2af86afc8c069345adbc7 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_78.txt @@ -0,0 +1 @@ +Whisper is also an ASR model, but unlike many other speech models, it is pretrained on a massive amount of ✨ labeled ✨ audio transcription data for zero-shot performance. 
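As a rough illustration of the speech recognition flow described above, here is a hedged sketch; openai/whisper-small is an assumed checkpoint and sample.flac a placeholder audio path (local decoding also requires ffmpeg).

from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")  # assumed checkpoint
print(asr("sample.flac")["text"])  # placeholder audio path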
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_79.txt b/chunked/content_aware_chunking/_model_summary/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..845260540f5c8deb2a8b0009bf7eba0805e73d3c --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_79.txt @@ -0,0 +1 @@ +A large chunk of the dataset also contains non-English languages, meaning Whisper can also be used for low-resource languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_80.txt b/chunked/content_aware_chunking/_model_summary/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..713ecda414ae133232319a733ff0981b9eb7a89d --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_80.txt @@ -0,0 +1 @@ +Structurally, Whisper is similar to Speech2Text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_81.txt b/chunked/content_aware_chunking/_model_summary/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb41278e6531360211377ba1efe1233b9ba31672 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_81.txt @@ -0,0 +1 @@ +The audio signal is converted to a log-mel spectrogram encoded by the encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_82.txt b/chunked/content_aware_chunking/_model_summary/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd4883848164661ed2783788b6162a7c972ce3c2 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_82.txt @@ -0,0 +1 @@ +The decoder generates the transcript autoregressively from the encoder's hidden states and the previous tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_83.txt b/chunked/content_aware_chunking/_model_summary/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..43bcb8cf0ee30293fab95f88217cd8f6c45fee79 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_83.txt @@ -0,0 +1,4 @@ +Multimodal + +Encoder[[mm-encoder]] +VisualBERT is a multimodal model for vision-language tasks released shortly after BERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_84.txt b/chunked/content_aware_chunking/_model_summary/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..21d9f2d708d0d299751160dc3bf4c9a2c7a191dc --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_84.txt @@ -0,0 +1 @@ +It combines BERT and a pretrained object detection system to extract image features into visual embeddings, passed alongside text embeddings to BERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_85.txt b/chunked/content_aware_chunking/_model_summary/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b78d5ae8b3dbd131d2166330e0ed11de3ecf078 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_85.txt @@ -0,0 +1 @@ +VisualBERT predicts the masked text based on the unmasked text and the visual embeddings, and it also has to predict whether the text is aligned with the image. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_86.txt b/chunked/content_aware_chunking/_model_summary/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6f497a1f496d1bc335fa7c5e553f9f4ca2c8637 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_86.txt @@ -0,0 +1 @@ +When ViT was released, ViLT adopted ViT in its architecture because it was easier to get the image embeddings this way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_87.txt b/chunked/content_aware_chunking/_model_summary/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..65847164ba0e6bab5fe8afb8ddc63229649ece5a --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_87.txt @@ -0,0 +1 @@ +The image embeddings are jointly processed with the text embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_88.txt b/chunked/content_aware_chunking/_model_summary/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa4c02db001428cea0585194764a31bd2c302c54 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_88.txt @@ -0,0 +1 @@ +From there, ViLT is pretrained by image text matching, masked language modeling, and whole word masking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_89.txt b/chunked/content_aware_chunking/_model_summary/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa2604498c500fb02da75f184a7135ef60b18500 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_89.txt @@ -0,0 +1 @@ +CLIP takes a different approach and makes a pair prediction of (image, text) . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_90.txt b/chunked/content_aware_chunking/_model_summary/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfe73a219a087f48f91f722dff98091738b1c5e8 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_90.txt @@ -0,0 +1 @@ +An image encoder (ViT) and a text encoder (Transformer) are jointly trained on a 400 million (image, text) pair dataset to maximize the similarity between the image and text embeddings of the (image, text) pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_91.txt b/chunked/content_aware_chunking/_model_summary/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c6cf84055e345d8bb98ea32b7a242cd4cc7cd70 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_91.txt @@ -0,0 +1 @@ +After pretraining, you can use natural language to instruct CLIP to predict the text given an image or vice versa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_92.txt b/chunked/content_aware_chunking/_model_summary/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..d051ef31bc384e6530970c519880ca5b5ac32a1a --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_92.txt @@ -0,0 +1 @@ +OWL-ViT builds on top of CLIP by using it as its backbone for zero-shot object detection. 
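Given the CLIP description above (jointly trained image and text encoders whose embedding similarity can be queried with natural language), a hedged zero-shot sketch follows; the openai/clip-vit-base-patch32 checkpoint and the photo.jpg path are assumptions.

from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")         # assumed checkpoint
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

image = Image.open("photo.jpg")  # placeholder local image
inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True)

outputs = model(**inputs)
probs = outputs.logits_per_image.softmax(dim=-1)  # image-text similarity turned into label probabilities
print(probs)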
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_93.txt b/chunked/content_aware_chunking/_model_summary/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..92abd85f2a18823d36656f7ee76c80eaf608e6b3 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_93.txt @@ -0,0 +1 @@ +After pretraining, an object detection head is added to make a set prediction over the (class, bounding box) pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_94.txt b/chunked/content_aware_chunking/_model_summary/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cb911448b344d9e64c2e9605fd0739bca191c65 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_94.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[mm-encoder-decoder]] +Optical character recognition (OCR) is a long-standing text recognition task that typically involves several components to understand the image and generate the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_95.txt b/chunked/content_aware_chunking/_model_summary/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..e25041cb47ec5357228565135a0ac8810ffffe2c --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_95.txt @@ -0,0 +1 @@ +TrOCR simplifies the process using an end-to-end Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_96.txt b/chunked/content_aware_chunking/_model_summary/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a2273c94123fc68ab3a9ab0827638460d14636 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_96.txt @@ -0,0 +1 @@ +The encoder is a ViT-style model for image understanding and processes the image as fixed-size patches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_97.txt b/chunked/content_aware_chunking/_model_summary/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..96bb682e6984f07638f3720aec41bd13d1030e94 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_97.txt @@ -0,0 +1 @@ +The decoder accepts the encoder's hidden states and autoregressively generates text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_98.txt b/chunked/content_aware_chunking/_model_summary/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..eaf0ba3284797af3cc1e6ab564dc671802fe433d --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_98.txt @@ -0,0 +1 @@ +Donut is a more general visual document understanding model that doesn't rely on OCR-based approaches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_model_summary/chunk_99.txt b/chunked/content_aware_chunking/_model_summary/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..76212bd1b7a5c97c2240102c8564c0a40b7825e7 --- /dev/null +++ b/chunked/content_aware_chunking/_model_summary/chunk_99.txt @@ -0,0 +1 @@ +It uses a Swin Transformer as the encoder and multilingual BART as the decoder. 
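The TrOCR chunks above (a ViT-style encoder over fixed-size image patches, an autoregressive text decoder) translate into a short sketch like the following; the microsoft/trocr-base-handwritten checkpoint and the image path are assumptions.

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")   # assumed checkpoint
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

pixel_values = processor(images=Image.open("text_line.png").convert("RGB"), return_tensors="pt").pixel_values  # image as fixed-size patches
generated_ids = model.generate(pixel_values)  # decoder generates the text autoregressively from the encoder's hidden states
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])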
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_11.txt b/chunked/content_aware_chunking/_multilingual/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa22000157aa3c9bc1375ea6ed0818f5bc8c13d6 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_11.txt @@ -0,0 +1,10 @@ +language_id = tokenizer.lang2id["en"] # 0 +langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, , 0]) +We reshape it to be of size (batch_size, sequence_length) +langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) + +Now you can pass the input_ids and language embedding to the model: + +outputs = model(input_ids, langs=langs) + +The run_generation.py script can generate text with language embeddings using the xlm-clm checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_12.txt b/chunked/content_aware_chunking/_multilingual/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..36efcb8ad1abf3e95e81fb4ea61726da5e7c9388 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_12.txt @@ -0,0 +1,7 @@ +XLM without language embeddings +The following XLM models do not require language embeddings during inference: + +FacebookAI/xlm-mlm-17-1280 (Masked language modeling, 17 languages) +FacebookAI/xlm-mlm-100-1280 (Masked language modeling, 100 languages) + +These models are used for generic sentence representations, unlike the previous XLM checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_13.txt b/chunked/content_aware_chunking/_multilingual/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5cfcd23e0cdd59118d8b13f903a605140b850d7 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_13.txt @@ -0,0 +1,7 @@ +BERT +The following BERT models can be used for multilingual tasks: + +google-bert/bert-base-multilingual-uncased (Masked language modeling + Next sentence prediction, 102 languages) +google-bert/bert-base-multilingual-cased (Masked language modeling + Next sentence prediction, 104 languages) + +These models do not require language embeddings during inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_14.txt b/chunked/content_aware_chunking/_multilingual/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..634b63398c347de9be9180aa2a9c424ec36ef9c5 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_14.txt @@ -0,0 +1,2 @@ +They should identify the language from the +context and infer accordingly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_15.txt b/chunked/content_aware_chunking/_multilingual/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b821fe00c713a25e48621f879190f616618610a8 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_15.txt @@ -0,0 +1,7 @@ +XLM-RoBERTa +The following XLM-RoBERTa models can be used for multilingual tasks: + +FacebookAI/xlm-roberta-base (Masked language modeling, 100 languages) +FacebookAI/xlm-roberta-large (Masked language modeling, 100 languages) + +XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. 
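The language-embedding snippet in the _multilingual chunks above is only a fragment (it assumes input_ids and a loaded model already exist); a self-contained sketch could look like this, where FacebookAI/xlm-clm-enfr-1024 stands in for the xlm-clm checkpoints the chunk mentions.

import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024")  # assumed xlm-clm checkpoint
model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024")

input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size of 1
language_id = tokenizer.lang2id["en"]                                  # e.g. 0
langs = torch.tensor([language_id] * input_ids.shape[1])               # torch.tensor([0, 0, 0, ..., 0])
langs = langs.view(1, -1)                                              # shape (batch_size, sequence_length)

outputs = model(input_ids, langs=langs)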
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_16.txt b/chunked/content_aware_chunking/_multilingual/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..75391905722c4495a708ca805cec72273a890858 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_16.txt @@ -0,0 +1 @@ +It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_17.txt b/chunked/content_aware_chunking/_multilingual/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..de719a8130815b1fd6daa3a6d7dcdd304c7a9aa8 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_17.txt @@ -0,0 +1,7 @@ +M2M100 +The following M2M100 models can be used for multilingual translation: + +facebook/m2m100_418M (Translation) +facebook/m2m100_1.2B (Translation) + +In this example, load the facebook/m2m100_418M checkpoint to translate from Chinese to English. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_18.txt b/chunked/content_aware_chunking/_multilingual/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..0071e4bc41b333e0aa1759539ea01cbb902b5abd --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_18.txt @@ -0,0 +1,4 @@ +You can set the source language in the tokenizer: + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_19.txt b/chunked/content_aware_chunking/_multilingual/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a042c0b65e0ede9d5e5496f2ff9604426126aca7 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_19.txt @@ -0,0 +1 @@ +chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_20.txt b/chunked/content_aware_chunking/_multilingual/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f98aec7c41063301e36f4658b5b3ce421029a8ab --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_20.txt @@ -0,0 +1,8 @@ +tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + +Tokenize the text: + +encoded_zh = tokenizer(chinese_text, return_tensors="pt") + +M2M100 forces the target language id as the first generated token to translate to the target language.
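Putting the M2M100 pieces from these chunks together with the forced_bos_token_id step described in the next chunk gives one runnable sketch:

from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒."

tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh")
model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")

encoded_zh = tokenizer(chinese_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))  # force the target language id as the first generated token
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))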
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_21.txt b/chunked/content_aware_chunking/_multilingual/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5ffc2efdd5e572bf881dcfddebc909a89c87b0b --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_21.txt @@ -0,0 +1,5 @@ +Set the forced_bos_token_id to en in the generate method to translate to English: + +generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_22.txt b/chunked/content_aware_chunking/_multilingual/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..edcc1a689693cae0f6c73126d4e31c047053a8e8 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_22.txt @@ -0,0 +1,10 @@ +MBart +The following MBart models can be used for multilingual translation: + +facebook/mbart-large-50-one-to-many-mmt (One-to-many multilingual machine translation, 50 languages) +facebook/mbart-large-50-many-to-many-mmt (Many-to-many multilingual machine translation, 50 languages) +facebook/mbart-large-50-many-to-one-mmt (Many-to-one multilingual machine translation, 50 languages) +facebook/mbart-large-50 (Multilingual translation, 50 languages) +facebook/mbart-large-cc25 + +In this example, load the facebook/mbart-large-50-many-to-many-mmt checkpoint to translate Finnish to English. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_23.txt b/chunked/content_aware_chunking/_multilingual/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bea13207ae3f320609b54dad0a0db3579ab0b68 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_23.txt @@ -0,0 +1,4 @@ +You can set the source language in the tokenizer: + +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_24.txt b/chunked/content_aware_chunking/_multilingual/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..296b9a51aa6880b25051fb36440de534e93656a1 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_24.txt @@ -0,0 +1 @@ +fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_25.txt b/chunked/content_aware_chunking/_multilingual/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..844f708e84ba52ccda771b70e184de96b38ef1f6 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_25.txt @@ -0,0 +1,8 @@ +tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + +Tokenize the text: + +encoded_en = tokenizer(en_text, return_tensors="pt") + +MBart forces the target language id as the first generated token to translate to the target language. 
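Similarly, the MBart many-to-many example can be assembled into a single sketch; note that for the stated Finnish-to-English direction it is the Finnish sentence that should be encoded (the surrounding chunks tokenize en_text), and the forced target language id comes from the chunk that follows.

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia."

tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")

encoded_fi = tokenizer(fi_text, return_tensors="pt")
generated_tokens = model.generate(**encoded_fi, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])  # force the target language id
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))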
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_26.txt b/chunked/content_aware_chunking/_multilingual/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bdbc6566552ba1ad5ef890c9a9e74168f5598a2 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_26.txt @@ -0,0 +1,5 @@ +Set the forced_bos_token_id to en in the generate method to translate to English: + +generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_multilingual/chunk_27.txt b/chunked/content_aware_chunking/_multilingual/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca4ca1ddefff73060cf178a4bf86b3ac17905118 --- /dev/null +++ b/chunked/content_aware_chunking/_multilingual/chunk_27.txt @@ -0,0 +1 @@ +If you are using the facebook/mbart-large-50-many-to-one-mmt checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_10.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d039d05dace03a142e7a7739009142150edea9e8 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_10.txt @@ -0,0 +1 @@ +Padding will still be applied if you only provide a single sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_11.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c4c05674c138dc3c01848589afda92fb52f15f8 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_11.txt @@ -0,0 +1 @@ +False or 'do_not_pad': no padding is applied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_12.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..049a5a8078835d009eb59dd034f210db553bc64c --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_12.txt @@ -0,0 +1 @@ +This is the default behavior. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_13.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ffd8c0c7e436cd6327a8ecd426a3b3d303e897 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_13.txt @@ -0,0 +1 @@ +The truncation argument controls truncation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_14.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..17cbe34d9c3c5779271e35f2510ad0e452cc05ab --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_14.txt @@ -0,0 +1,4 @@ +It can be a boolean or a string: + +True or 'longest_first': truncate to a maximum length specified by the max_length argument or + the maximum length accepted by the model if no max_length is provided (max_length=None). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_15.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..393940a68c468a77d3eeafd842e16cb9b35c22c5 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_15.txt @@ -0,0 +1,3 @@ +This will + truncate token by token, removing a token from the longest sequence in the pair until the proper length is + reached. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_16.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e7f4561b8a9a86ac5f564074cfe09d74abec3bf --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_16.txt @@ -0,0 +1,2 @@ +'only_second': truncate to a maximum length specified by the max_length argument or the maximum + length accepted by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_17.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..044c84abbc5822ce50222ed6e5818053a3fe0569 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_17.txt @@ -0,0 +1,2 @@ +This will only truncate + the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_18.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa48350cf9a22e8f0c7f4e2671b01f512ed3629d --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_18.txt @@ -0,0 +1,2 @@ +'only_first': truncate to a maximum length specified by the max_length argument or the maximum + length accepted by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_19.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9797409b9bbbb2d6806a28f07fcbde04431413c0 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_19.txt @@ -0,0 +1,2 @@ +This will only truncate + the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_20.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..afb7558c6e4077bde81cff82d44112f04def18a7 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_20.txt @@ -0,0 +1 @@ +False or 'do_not_truncate': no truncation is applied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_21.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..049a5a8078835d009eb59dd034f210db553bc64c --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_21.txt @@ -0,0 +1 @@ +This is the default behavior. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_22.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d819dc4ab95dcf63c7a9d82273702d8479966731 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_22.txt @@ -0,0 +1 @@ +The max_length argument controls the length of the padding and truncation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_23.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3190d9599fec8ca2f5850cd5024dd2b2b5c21f84 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_23.txt @@ -0,0 +1 @@ +It can be an integer or None, in which case it will default to the maximum length the model can accept. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_24.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..5442d473cc6741a38dfbe77b42bc42859f892cf2 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_24.txt @@ -0,0 +1 @@ +If the model has no specific maximum input length, truncation or padding to max_length is deactivated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_25.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..0eb3914bcde8e9d779323d03dec6a13ee4040933 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_25.txt @@ -0,0 +1 @@ +The following table summarizes the recommended way to setup padding and truncation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_26.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f96ed4618f3a7fccf85b80770ceea469d68d9ac --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_26.txt @@ -0,0 +1,2 @@ +If you use pairs of input sequences in any of the following examples, you can replace truncation=True by a STRATEGY selected in +['only_first', 'only_second', 'longest_first'], i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_27.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ecca11e0f74413be99dbc68db0a981f83bd1fe4 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_27.txt @@ -0,0 +1 @@ +truncation='only_second' or truncation='longest_first' to control how both sequences in the pair are truncated as detailed before. 
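A short sketch of how these padding and truncation arguments combine in practice, reusing the google-bert/bert-base-multilingual-cased checkpoint listed earlier in this diff (the sentences are placeholders):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")
batch_sentences = ["A short sentence.", "A noticeably longer sentence that will need quite a few more tokens."]

batch = tokenizer(batch_sentences, padding=True, truncation=True)                                 # pad to longest in batch, truncate to the model's max input length
fixed_length = tokenizer(batch_sentences, padding="max_length", truncation=True, max_length=42)   # pad and truncate everything to exactly 42 tokens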
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_28.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..27aa7faac2a2a130a1cb018d9914344e20ef9097 --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_28.txt @@ -0,0 +1,22 @@ +| Truncation | Padding | Instruction | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | tokenizer(batch_sentences) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True) or | +| | | tokenizer(batch_sentences, padding='longest') | +| | padding to max model input length | tokenizer(batch_sentences, padding='max_length') | +| | padding to specific length | tokenizer(batch_sentences, padding='max_length', max_length=42) | +| | padding to a multiple of a value | tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8) | +| truncation to max model input length | no padding | tokenizer(batch_sentences, truncation=True) or | +| | | tokenizer(batch_sentences, truncation=STRATEGY) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True, truncation=True) or | +| | | tokenizer(batch_sentences, padding=True, truncation=STRATEGY) | +| | padding to max model input length | tokenizer(batch_sentences, padding='max_length', truncation=True) or | +| | | tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY) | +| | padding to specific length | Not possible | +| truncation to specific length | no padding | tokenizer(batch_sentences, truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, truncation=STRATEGY, max_length=42) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True, truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42) | +| | padding to max model input length | Not possible | +| | padding to specific length | tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42) | \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pad_truncation/chunk_9.txt b/chunked/content_aware_chunking/_pad_truncation/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef19bb111bbc788b16e79fb0b8144cc7647de07b --- /dev/null +++ b/chunked/content_aware_chunking/_pad_truncation/chunk_9.txt @@ -0,0 +1,2 @@ +'max_length': pad to a length specified by the max_length argument or the maximum length accepted + by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_14.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d60d9bb6d2dde080496b911cda51bd4191c783 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_14.txt @@ -0,0 +1 @@ +Some lower quality ones may not give the card the stable voltage it needs to function at its peak. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_15.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a028374d982ee45390d4eb4942b69d68696bfd96 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_15.txt @@ -0,0 +1 @@ +And of course the PSU needs to have enough unused Watts to power the card. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_16.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9ff56c17b77b5167ace939364c6ada34711f02d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_16.txt @@ -0,0 +1,2 @@ +Cooling: +When a GPU gets overheated it will start throttling down and will not deliver full performance and it can even shutdown if it gets too hot. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_17.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e604b1eab34b03fd494f8dbea260a5f8c99fa28 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_17.txt @@ -0,0 +1 @@ +It's hard to tell the exact best temperature to strive for when a GPU is heavily loaded, but probably anything under +80C is good, but lower is better - perhaps 70-75C is an excellent range to be in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_18.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..de8292dfe3592a169eb944a35df961e528b5b98c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_18.txt @@ -0,0 +1 @@ +The throttling down is likely to start at around 84-90C. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_19.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d974b8c0799cf140e395a0d90c2f4b614f9ce93 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_19.txt @@ -0,0 +1 @@ +But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_20.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..e14638337f6b46e59a9f9b65e790eeeefb057f32 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_20.txt @@ -0,0 +1 @@ +Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_21.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab16dfd6398e43d66859e34adc8194207520c4b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_21.txt @@ -0,0 +1,2 @@ +Multi-GPU Connectivity +If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_22.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a21af511e9fa7802be4fb4792a36956417ef57ea --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_22.txt @@ -0,0 +1,4 @@ +If the GPUs are on the same physical node, you can run: + +nvidia-smi topo -m +and it will tell you how the GPUs are inter-connected. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_23.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f4933559eab5b172bb5722264b5b7119c1d74bf --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_23.txt @@ -0,0 +1,17 @@ +On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: +GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +on a different machine w/o NVLink we may see: +GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +The report includes this legend: +X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +So the first report NV2 tells us the GPUs are interconnected with 2 NVLinks, and the second report PHB we have a typical consumer-level PCIe+Bridge setup. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_24.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..83c000e2ad3c194ef5c9be019e78b3637a4e452a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_24.txt @@ -0,0 +1 @@ +Check what type of connectivity you have on your setup. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_25.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..939fcf5839cddafa9591d1f5a372085c7127c54c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_25.txt @@ -0,0 +1 @@ +Some of these will make the communication between cards faster (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_26.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5aa3f7740b0a644a36500bf3deb5f059202cf15 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_26.txt @@ -0,0 +1 @@ +NVLink), others slower (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_27.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dc462a0c6a3feebbb64f10c2dc725d90eab7f52 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_27.txt @@ -0,0 +1 @@ +PHB). 
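As a rough helper (assuming the NVIDIA driver and nvidia-smi are installed), the topology report above can be captured and scanned for NVLink entries from Python:

import subprocess

topo = subprocess.run(["nvidia-smi", "topo", "-m"], capture_output=True, text=True).stdout
print(topo)

# look only at the GPU rows of the matrix; NV1/NV2/... cells mean NVLink, PHB/PXB/PIX mean PCIe paths
gpu_rows = [line for line in topo.splitlines() if line.startswith("GPU")]
has_nvlink = any(cell.startswith("NV") for row in gpu_rows for cell in row.split())
print("NVLink detected:", has_nvlink)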
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_28.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..acbc60c786195b036f24101ecf27f9f4a5a202b8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_28.txt @@ -0,0 +1 @@ +Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_29.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..d96a52b33aaa5cd01a5ff806b9bde9247e24dadb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_29.txt @@ -0,0 +1 @@ +If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_30.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec27df932f418f1e6720d18577c0e42b64ce25bc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_30.txt @@ -0,0 +1 @@ +If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_31.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..de1a49f76666b90ea1bd2a891b266bcda532b655 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_31.txt @@ -0,0 +1,2 @@ +NVlink +NVLink is a wire-based serial multi-lane near-range communications link developed by Nvidia. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_32.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..0adddac0304f264cbc6b241c51c86279640f75f1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_32.txt @@ -0,0 +1 @@ +Each new generation provides a faster bandwidth, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_33.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1dc390f0631b9a3397ecda61ebfa881b7146978b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_33.txt @@ -0,0 +1,5 @@ +here is a quote from Nvidia Ampere GA102 GPU Architecture: + +Third-Generation NVLink® +GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_34.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..4884013147c8bb4fa998a8afbc08e71aad9712d6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_34.txt @@ -0,0 +1,3 @@ +Four +links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +between two GPUs. 
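The quoted figures are just the per-link number multiplied out:

```python
per_link = 14.0625  # GB/s in each direction for one third-generation NVLink link (GA102)
links = 4

per_direction = links * per_link   # 56.25 GB/s in each direction
total = 2 * per_direction          # 112.5 GB/s counting both directions
print(per_direction, total)
```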
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_35.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ed6e1c48154c8466884ef92dad7fd90484f5877 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_35.txt @@ -0,0 +1 @@ +Two RTX 3090 GPUs can be connected together for SLI using NVLink. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_36.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..899100e4ff7f7a6861df79ded98dc21b4737a078 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_36.txt @@ -0,0 +1 @@ +(Note that 3-Way and 4-Way SLI configurations are not supported.) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_37.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..41499e92f896fb33827eb38ed4644c059e665e1f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_37.txt @@ -0,0 +1 @@ +So the higher X you get in the report of NVX in the output of nvidia-smi topo -m the better. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_38.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e378811bab5a2269c2420f116e3fbab0c9ac25a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_38.txt @@ -0,0 +1 @@ +The generation will depend on your GPU architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_39.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..217a6230e8541b93068cdc3412138958df00beed --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_39.txt @@ -0,0 +1 @@ +Let's compare the execution of a openai-community/gpt2 language model training over a small sample of wikitext. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_40.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b414389728acff50329c8905b586ba5ded8e2e4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_40.txt @@ -0,0 +1,6 @@ +The results are: +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | +You can see that NVLink completes the training ~23% faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_41.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..84c4ca278669234ec8b839ff8b4f665a8d121a23 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_41.txt @@ -0,0 +1 @@ +In the second benchmark we use NCCL_P2P_DISABLE=1 to tell the GPUs not to use NVLink. 
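The ~23% figure follows directly from the two runtimes reported in the table:

```python
nvlink_seconds, no_nvlink_seconds = 101, 131
speedup = (no_nvlink_seconds - nvlink_seconds) / no_nvlink_seconds
print(f"NVLink run finishes ~{speedup:.0%} faster")  # ~23%
```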
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_hardware/chunk_42.txt b/chunked/content_aware_chunking/_perf_hardware/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..72882d938123b8bb6880a837e4f44dae8ecbac18 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_hardware/chunk_42.txt @@ -0,0 +1,17 @@ +Here is the full benchmark code and outputs:
+```bash
+# DDP w/ NVLink
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
+--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
+--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
+# DDP w/o NVLink
+rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \
+--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \
+--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \
+--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200
+{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
+
+Hardware: 2x TITAN RTX 24GB each + NVLink with 2 NVLinks (NV2 in nvidia-smi topo -m)
+Software: pytorch-1.8-to-be + cuda-11.0 / transformers==4.3.0.dev0 \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_10.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f57aee0e194c16caea59f59619d6fe0d193170a5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_10.txt @@ -0,0 +1 @@ +Before you start, make sure you have 🤗 Optimum installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_11.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f931f52c134adf415ba3ee9085cdf9d3efa455f2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_11.txt @@ -0,0 +1,8 @@ +Enable BetterTransformer with the [PreTrainedModel.to_bettertransformer] method:
+
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder")
+model.to_bettertransformer()
+
+TorchScript
+TorchScript is an intermediate PyTorch model representation that can be run in production environments where performance is important. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_12.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d4a8af0484aab567fea019b7b30ee9683bce317 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_12.txt @@ -0,0 +1 @@ +You can train a model in PyTorch and then export it to TorchScript to free the model from Python performance constraints.
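As a rough illustration of that export step (a minimal sketch; the distilbert checkpoint is only a placeholder), a Transformers model is loaded with torchscript=True and traced with example tensors:

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# torchscript=True makes the model return tuples, which torch.jit.trace requires
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", torchscript=True
)
model.eval()

inputs = tokenizer("TorchScript example", return_tensors="pt")
traced = torch.jit.trace(model, (inputs["input_ids"], inputs["attention_mask"]))
torch.jit.save(traced, "traced_model.pt")
```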
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_13.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..613c23079bc903e11b8f72a951a01ea97d179cb2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_13.txt @@ -0,0 +1 @@ +PyTorch traces a model to return a [ScriptFunction] that is optimized with just-in-time compilation (JIT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_14.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..181e85cad60948e99fed5b45c1a5f88609dbde35 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_14.txt @@ -0,0 +1 @@ +Compared to the default eager mode, JIT mode in PyTorch typically yields better performance for inference using optimization techniques like operator fusion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_15.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c75dc2d568d56b56750608887e8d2e97e3349a1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_15.txt @@ -0,0 +1 @@ +For a gentle introduction to TorchScript, see the Introduction to PyTorch TorchScript tutorial. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_16.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1d393e9a6f4f35735dcbf195087cafcae4fa90b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_16.txt @@ -0,0 +1,13 @@ +With the [Trainer] class, you can enable JIT mode for CPU inference by setting the --jit_mode_eval flag: + +python run_qa.py \ +--model_name_or_path csarron/bert-base-uncased-squad-v1 \ +--dataset_name squad \ +--do_eval \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/ \ +--no_cuda \ +--jit_mode_eval + +For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluation since the dict input is supported in jit.trace. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_17.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..407aa761d843abb72a60d29a8c803375f9e567cd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_17.txt @@ -0,0 +1 @@ +For PyTorch < 1.14.0, JIT-mode could benefit a model if its forward parameter order matches the tuple input order in jit.trace, such as a question-answering model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_18.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a21ba199947e07a748359de2d45b97fa0ca2599 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_18.txt @@ -0,0 +1 @@ +If the forward parameter order does not match the tuple input order in jit.trace, like a text classification model, jit.trace will fail and we are capturing this with the exception here to make it fallback. 
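If you configure the [Trainer] from Python rather than the command line, the same switch is exposed as a TrainingArguments field (a minimal sketch; the argument values are illustrative and assume a recent Transformers version):

```python
from transformers import TrainingArguments

args = TrainingArguments(
    output_dir="/tmp/test-qa",
    do_eval=True,
    use_cpu=True,        # recent versions use use_cpu in place of the older no_cuda flag
    jit_mode_eval=True,  # trace the model with torch.jit.trace for evaluation/prediction
)
```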
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_19.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..6179a2f42ea391fae7c528b05eeceda977b249f5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_19.txt @@ -0,0 +1 @@ +Logging is used to notify users. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_20.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ed37fcf877e3032d343dcaf72f9cea7eb279794 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_20.txt @@ -0,0 +1,2 @@ +IPEX graph optimization +Intel® Extension for PyTorch (IPEX) provides further optimizations in JIT mode for Intel CPUs, and we recommend combining it with TorchScript for even faster performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_21.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..15289bef1c12fc55ae911e6421822648cd7573d1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_21.txt @@ -0,0 +1 @@ +The IPEX graph optimization fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_22.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..150a76b5cd93fde9194226a1417e8d0217bdd228 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_22.txt @@ -0,0 +1,18 @@ +To take advantage of these graph optimizations, make sure you have IPEX installed: + +pip install intel_extension_for_pytorch +Set the --use_ipex and --jit_mode_eval flags in the [Trainer] class to enable JIT mode with the graph optimizations: + +python run_qa.py \ +--model_name_or_path csarron/bert-base-uncased-squad-v1 \ +--dataset_name squad \ +--do_eval \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/ \ +--no_cuda \ +--use_ipex \ +--jit_mode_eval +🤗 Optimum + +Learn more details about using ORT with 🤗 Optimum in the Optimum Inference with ONNX Runtime guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_23.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef51a45648c567815c964174efe4feb0fd0b8e4b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_23.txt @@ -0,0 +1 @@ +This section only provides a brief and simple example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_24.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc9cb1fe3532bf80766dd67a3f260788bb8eaac0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_24.txt @@ -0,0 +1 @@ +ONNX Runtime (ORT) is a model accelerator that runs inference on CPUs by default. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_25.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..abad55440e7cee8a01a8309ec3ec848662b1efa2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_25.txt @@ -0,0 +1 @@ +ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers, without making too many changes to your code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_26.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d77ff4bd614b86ac197c5a4ba3a83a16d2f1e9ab --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_26.txt @@ -0,0 +1 @@ +You only need to replace the 🤗 Transformers AutoClass with its equivalent [~optimum.onnxruntime.ORTModel] for the task you're solving, and load a checkpoint in the ONNX format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_27.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae8663174331b89f7f76160283451427c6c44731 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_27.txt @@ -0,0 +1,8 @@ +For example, if you're running inference on a question answering task, load the optimum/roberta-base-squad2 checkpoint which contains a model.onnx file: + +from transformers import AutoTokenizer, pipeline +from optimum.onnxruntime import ORTModelForQuestionAnswering +model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") +tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") +onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer) +question = "What's my name?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_28.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1207aecea370fa66cca85bccd32e6bfabca65b4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_28.txt @@ -0,0 +1 @@ +context = "My name is Philipp and I live in Nuremberg." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_cpu/chunk_29.txt b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d1d3fa8f602f6dff10362da832e4b341cc07d89 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_cpu/chunk_29.txt @@ -0,0 +1,3 @@ +pred = onnx_qa(question, context) + +If you have an Intel CPU, take a look at 🤗 Optimum Intel which supports a variety of compression techniques (quantization, pruning, knowledge distillation) and tools for converting models to the OpenVINO format for higher performance inference. 
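For illustration, the OpenVINO route can look roughly like this (a minimal sketch, assuming optimum-intel is installed with OpenVINO support, e.g. pip install "optimum[openvino]"):

```python
from optimum.intel import OVModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline

# export=True converts the PyTorch checkpoint to the OpenVINO format on the fly
model = OVModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2", export=True)
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
ov_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

print(ov_qa(question="What's my name?", context="My name is Philipp and I live in Nuremberg."))
```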
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_26.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..65b351ae6426bef28ef98b9e0c8d9895fbc17386 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_26.txt @@ -0,0 +1,19 @@ +By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with torch.backends.cuda.sdp_kernel as a context manager: + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") +convert the model to BetterTransformer +model.to_bettertransformer() +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + +If you see a bug with the traceback below, try using the nightly version of PyTorch which may have broader coverage for FlashAttention: +```bash +RuntimeError: No available kernel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_27.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecd4b6693234ae16cbcbc52d16027607326ccb84 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_27.txt @@ -0,0 +1 @@ +Aborting execution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_28.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..406cc6ad9d7db7509dc763d6a423112cd817b221 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_28.txt @@ -0,0 +1,6 @@ +install PyTorch nightly +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 + +BetterTransformer + +Some BetterTransformer features are being upstreamed to Transformers with default support for native torch.nn.scaled_dot_product_attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_29.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..397bc99550c221498db8cf60bc741c9a3d9c9d81 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_29.txt @@ -0,0 +1 @@ +BetterTransformer still has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to natively support SDPA in Transformers. 
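For architectures that already have native SDPA support, recent Transformers releases let you request it explicitly when loading the model (a minimal sketch, assuming a version that accepts attn_implementation and a checkpoint/architecture with SDPA support):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",  # use torch.nn.functional.scaled_dot_product_attention directly
).to("cuda")
```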
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_30.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..15e1309d37f150f57728866b9bb9d9f7d8061bdc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_30.txt @@ -0,0 +1 @@ +Check out our benchmarks with BetterTransformer and scaled dot product attention in the Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 and learn more about the fastpath execution in the BetterTransformer blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_31.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c63cee4146c82d6aa97537876587db53cd5a01d2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_31.txt @@ -0,0 +1 @@ +BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_32.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cac088de642942db63fb842d74ddcee863fe7d5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_32.txt @@ -0,0 +1,6 @@ +The two optimizations in the fastpath execution are: + +fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps +skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors + +BetterTransformer also converts all attention operations to use the more memory-efficient scaled dot product attention (SDPA), and it calls optimized kernels like FlashAttention under the hood. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_33.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f57aee0e194c16caea59f59619d6fe0d193170a5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_33.txt @@ -0,0 +1 @@ +Before you start, make sure you have 🤗 Optimum installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_34.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..939b22bfed76180fc4d211fd48af64d8c45bab51 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_34.txt @@ -0,0 +1,4 @@ +Then you can enable BetterTransformer with the [PreTrainedModel.to_bettertransformer] method: +python +model = model.to_bettertransformer() +You can return the original Transformers model with the [~PreTrainedModel.reverse_bettertransformer] method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_35.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..92a6b7fe880c56a7c3a371e2b43f5fe0e4f1e736 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_35.txt @@ -0,0 +1,6 @@ +You should use this before saving your model to use the canonical Transformers modeling: +py +model = model.reverse_bettertransformer() +model.save_pretrained("saved_model") +bitsandbytes +bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_36.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e23723b0159f5f67746b8e9f54dbac6c83eec1d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_36.txt @@ -0,0 +1 @@ +Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_37.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7d8d2b4b9c890c5e05ef23377da4bfc44597cc7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_37.txt @@ -0,0 +1,9 @@ +Make sure you have bitsandbytes and 🤗 Accelerate installed: +```bash +these versions support 8-bit and 4-bit +pip install bitsandbytes>=0.39.0 accelerate>=0.20.0 +install Transformers +pip install transformers + +4-bit +To load a model in 4-bit for inference, use the load_in_4bit parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_38.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..d79a21d8257d5b431a75593030fb6e0ced628642 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_38.txt @@ -0,0 +1 @@ +The device_map parameter is optional, but we recommend setting it to "auto" to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_39.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecdbf2987409e0473f4dfe736543ee417e339f0e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_39.txt @@ -0,0 +1,5 @@ +from transformers import AutoModelForCausalLM +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) + +To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_40.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..209e75a16c4a7544b31e9f00a12c6492756e4d43 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_40.txt @@ -0,0 +1,10 @@ +For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: +py +max_memory_mapping = {0: "600MB", 1: "1GB"} +model_name = "bigscience/bloom-3b" +model_4bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping +) +8-bit + +If you're curious and interested in learning more about the concepts underlying 8-bit quantization, read the Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_41.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..24f72b8ad3d0265dc4953a6ec13d72b64d5af139 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_41.txt @@ -0,0 +1 @@ +To load a model in 8-bit for inference, use the load_in_8bit parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_42.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..99b6c2a7858193160f96e3ac7b7015f736f44a48 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_42.txt @@ -0,0 +1,7 @@ +The device_map parameter is optional, but we recommend setting it to "auto" to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment: + +from transformers import AutoModelForCausalLM +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +If you're loading a model in 8-bit for text generation, you should use the [~transformers.GenerationMixin.generate] method instead of the [Pipeline] function which is not optimized for 8-bit models and will be slower. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_43.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..a211224317f81fb0a043bedbe2c3eb7b12e12a0c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_43.txt @@ -0,0 +1 @@ +Some sampling strategies, like nucleus sampling, are also not supported by the [Pipeline] for 8-bit models. 
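To check how much memory the quantized model actually occupies, you can query its footprint (a minimal sketch; model_8bit refers to the model loaded above and the method requires 🤗 Accelerate):

```python
# model_8bit was loaded above with load_in_8bit=True
print(f"Memory footprint: {model_8bit.get_memory_footprint() / 1024**3:.2f} GB")
```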
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_44.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..de684a8821f12e2a9eb4637d498f53aaf0791687 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_44.txt @@ -0,0 +1,12 @@ +You should also place all inputs on the same device as the model:
+
+from transformers import AutoModelForCausalLM, AutoTokenizer
+model_name = "bigscience/bloom-2b5"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
+prompt = "Hello, my llama is cute"
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+generated_ids = model_8bit.generate(**inputs)
+outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
+
+To load a model in 8-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_45.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc78505b8aaa6227b9c4106ed94737805836d463 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_45.txt @@ -0,0 +1,9 @@ +For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:
+py
+max_memory_mapping = {0: "1GB", 1: "2GB"}
+model_name = "bigscience/bloom-3b"
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
+)
+
+Feel free to try running an 11 billion parameter T5 model or the 3 billion parameter BLOOM model for inference on Google Colab's free tier GPUs! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_46.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fc4cb95e11357167973d8d531f4b9f589294a18 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_46.txt @@ -0,0 +1,3 @@ +🤗 Optimum
+
+Learn more details about using ORT with 🤗 Optimum in the Accelerated inference on NVIDIA GPUs and Accelerated inference on AMD GPUs guides. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_47.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef51a45648c567815c964174efe4feb0fd0b8e4b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_47.txt @@ -0,0 +1 @@ +This section only provides a brief and simple example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_48.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..914ab93575f3dba8c8e64a0ff8c6ebbc918ce8c1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_48.txt @@ -0,0 +1 @@ +ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs, and AMD GPUs that use ROCm stack.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_49.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..881df7fa74c06ce7658010e3170e87d6ee936d37 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_49.txt @@ -0,0 +1 @@ +ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_50.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6a3b3b502a51235035fb7e65df1da9a5aa0f6a3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_50.txt @@ -0,0 +1 @@ +ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_51.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..f67458d851172d7b2df58ee091b02737a63a8691 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_51.txt @@ -0,0 +1 @@ +ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_52.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..668be4ac8c0e63f11c28dfc0e6c072c3d387f780 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_52.txt @@ -0,0 +1 @@ +You'll need to use an [~optimum.onnxruntime.ORTModel] for the task you're solving, and specify the provider parameter which can be set to either CUDAExecutionProvider, ROCMExecutionProvider or TensorrtExecutionProvider. 
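For a checkpoint that already ships ONNX weights, this looks roughly like the following (a minimal sketch reusing the optimum/roberta-base-squad2 checkpoint mentioned earlier):

```python
from optimum.onnxruntime import ORTModelForQuestionAnswering

# the provider argument selects the execution backend for the exported model
ort_model = ORTModelForQuestionAnswering.from_pretrained(
    "optimum/roberta-base-squad2",
    provider="CUDAExecutionProvider",
)
```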
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_53.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e8aee7cfe906fbaa0afb93ea807ba3f6574ef7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_53.txt @@ -0,0 +1,16 @@ +If you want to load a model that was not yet exported to ONNX, you can set export=True to convert your model on-the-fly to the ONNX format: + +from optimum.onnxruntime import ORTModelForSequenceClassification +ort_model = ORTModelForSequenceClassification.from_pretrained( + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + export=True, + provider="CUDAExecutionProvider", +) + +Now you're free to use the model for inference: + +from optimum.pipelines import pipeline +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english") +pipeline = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0") +result = pipeline("Both the music and visual were astounding, not to mention the actors performance.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_54.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c8bc1e2006f89cb8c6422935273a2ef1006f858 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_54.txt @@ -0,0 +1,2 @@ +Combine optimizations +It is often possible to combine several of the optimization techniques described above to get the best inference performance possible for your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_55.txt b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f739613aaf689b704f1e9ddc7ee915dc77cb303f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_infer_gpu_one/chunk_55.txt @@ -0,0 +1,20 @@ +For example, you can load a model in 4-bit, and then enable BetterTransformer with FlashAttention: + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +load model in 4-bit +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) +enable BetterTransformer +model = model.to_bettertransformer() +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") +enable FlashAttention +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_10.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb215719551bb3fe33b9ceb3a20b72827d5e12fb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_10.txt @@ -0,0 +1,2 @@ +Benchmarking code +Below you can find the benchmarking code for each task. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_11.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f1adfe2438f28e531ccdccc707ec13fb8e12343 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_11.txt @@ -0,0 +1 @@ +We warm up the GPU before inference and take the mean time of 300 inferences, using the same image each time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_12.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a3e9ab1b05e640e921f865fa36e65ef2f46ce45 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_12.txt @@ -0,0 +1,38 @@ +Image Classification with ViT +thon +import torch +from PIL import Image +import requests +import numpy as np +from transformers import AutoImageProcessor, AutoModelForImageClassification +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = torch.compile(model) +processed_input = processor(image, return_tensors='pt').to(device="cuda") +with torch.no_grad(): + _ = model(**processed_input) + +Object Detection with DETR +thon +from transformers import AutoImageProcessor, AutoModelForObjectDetection +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = torch.compile(model) +texts = ["a photo of a cat", "a photo of a dog"] +inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda") +with torch.no_grad(): + _ = model(**inputs) + +Image Segmentation with Segformer +thon +from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation +processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = torch.compile(model) +seg_inputs = processor(images=image, return_tensors="pt").to("cuda") +with torch.no_grad(): + _ = model(**seg_inputs) + +Below you can find the list of the models we benchmarked. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_13.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e949f85ff5e176f34b563cd94860bec9ab27319 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_13.txt @@ -0,0 +1,15 @@ +Image Classification +- google/vit-base-patch16-224 +- microsoft/beit-base-patch16-224-pt22k-ft22k +- facebook/convnext-large-224 +- microsoft/resnet-50 +Image Segmentation +- nvidia/segformer-b0-finetuned-ade-512-512 +- facebook/mask2former-swin-tiny-coco-panoptic +- facebook/maskformer-swin-base-ade +- google/deeplabv3_mobilenet_v2_1.0_513 +Object Detection +- google/owlvit-base-patch32 +- facebook/detr-resnet-101 +- microsoft/conditional-detr-resnet-50 +Below you can find visualization of inference durations with and without torch.compile() and percentage improvements for each model in different hardware and batch sizes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_14.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbb4429b3eb45a439fb7f52e72f928ea6ce47c44 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_14.txt @@ -0,0 +1 @@ +Below you can find inference durations in milliseconds for each model with and without compile(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_15.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8d07bebf2f5320fcf71b5f4003cbb4599afc629 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_15.txt @@ -0,0 +1 @@ +Note that OwlViT results in OOM in larger batch sizes. 
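The durations below were collected by warming up the GPU and averaging 300 timed runs of the same input, as described earlier; a rough sketch of such a measurement loop (the exact harness may differ) is:

```python
import torch

def benchmark(model, inputs, n_warmup=10, n_runs=300):
    """Warm up the GPU, then return the mean latency in milliseconds over n_runs."""
    timings = []
    with torch.no_grad():
        for _ in range(n_warmup):
            _ = model(**inputs)
        torch.cuda.synchronize()
        for _ in range(n_runs):
            start = torch.cuda.Event(enable_timing=True)
            end = torch.cuda.Event(enable_timing=True)
            start.record()
            _ = model(**inputs)
            end.record()
            torch.cuda.synchronize()
            timings.append(start.elapsed_time(end))  # milliseconds
    return sum(timings) / len(timings)

# e.g. benchmark(model, processed_input) with the ViT model and inputs prepared above
```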
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_16.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c14c7e64d9c883a7f9929f64d7b42ab12edaee3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_16.txt @@ -0,0 +1,131 @@ +A100 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 9.325 | 7.584 | +| Image Segmentation/Segformer | 11.759 | 10.500 | +| Object Detection/OwlViT | 24.978 | 18.420 | +| Image Classification/BeiT | 11.282 | 8.448 | +| Object Detection/DETR | 34.619 | 19.040 | +| Image Classification/ConvNeXT | 10.410 | 10.208 | +| Image Classification/ResNet | 6.531 | 4.124 | +| Image Segmentation/Mask2former | 60.188 | 49.117 | +| Image Segmentation/Maskformer | 75.764 | 59.487 | +| Image Segmentation/MobileNet | 8.583 | 3.974 | +| Object Detection/Resnet-101 | 36.276 | 18.197 | +| Object Detection/Conditional-DETR | 31.219 | 17.993 | +A100 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 14.832 | 14.499 | +| Image Segmentation/Segformer | 18.838 | 16.476 | +| Image Classification/BeiT | 13.205 | 13.048 | +| Object Detection/DETR | 48.657 | 32.418| +| Image Classification/ConvNeXT | 22.940 | 21.631 | +| Image Classification/ResNet | 6.657 | 4.268 | +| Image Segmentation/Mask2former | 74.277 | 61.781 | +| Image Segmentation/Maskformer | 180.700 | 159.116 | +| Image Segmentation/MobileNet | 14.174 | 8.515 | +| Object Detection/Resnet-101 | 68.101 | 44.998 | +| Object Detection/Conditional-DETR | 56.470 | 35.552 | +A100 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 40.944 | 40.010 | +| Image Segmentation/Segformer | 37.005 | 31.144 | +| Image Classification/BeiT | 41.854 | 41.048 | +| Object Detection/DETR | 164.382 | 161.902 | +| Image Classification/ConvNeXT | 82.258 | 75.561 | +| Image Classification/ResNet | 7.018 | 5.024 | +| Image Segmentation/Mask2former | 178.945 | 154.814 | +| Image Segmentation/Maskformer | 638.570 | 579.826 | +| Image Segmentation/MobileNet | 51.693 | 30.310 | +| Object Detection/Resnet-101 | 232.887 | 155.021 | +| Object Detection/Conditional-DETR | 180.491 | 124.032 | +V100 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 10.495 | 6.00 | +| Image Segmentation/Segformer | 13.321 | 5.862 | +| Object Detection/OwlViT | 25.769 | 22.395 | +| Image Classification/BeiT | 11.347 | 7.234 | +| Object Detection/DETR | 33.951 | 19.388 | +| Image Classification/ConvNeXT | 11.623 | 10.412 | +| Image Classification/ResNet | 6.484 | 3.820 | +| Image Segmentation/Mask2former | 64.640 | 49.873 | +| Image Segmentation/Maskformer | 95.532 | 72.207 | +| Image Segmentation/MobileNet | 9.217 | 4.753 | +| Object Detection/Resnet-101 | 52.818 | 28.367 | +| Object Detection/Conditional-DETR | 39.512 | 20.816 | +V100 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 15.181 | 14.501 | +| Image Segmentation/Segformer | 16.787 | 16.188 | +| Image Classification/BeiT | 15.171 | 14.753 | +| Object Detection/DETR | 88.529 | 64.195 | +| Image Classification/ConvNeXT | 29.574 | 27.085 | +| Image Classification/ResNet | 6.109 | 4.731 | 
+| Image Segmentation/Mask2former | 90.402 | 76.926 | +| Image Segmentation/Maskformer | 234.261 | 205.456 | +| Image Segmentation/MobileNet | 24.623 | 14.816 | +| Object Detection/Resnet-101 | 134.672 | 101.304 | +| Object Detection/Conditional-DETR | 97.464 | 69.739 | +V100 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 52.209 | 51.633 | +| Image Segmentation/Segformer | 61.013 | 55.499 | +| Image Classification/BeiT | 53.938 | 53.581 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 109.682 | 100.771 | +| Image Classification/ResNet | 14.857 | 12.089 | +| Image Segmentation/Mask2former | 249.605 | 222.801 | +| Image Segmentation/Maskformer | 831.142 | 743.645 | +| Image Segmentation/MobileNet | 93.129 | 55.365 | +| Object Detection/Resnet-101 | 482.425 | 361.843 | +| Object Detection/Conditional-DETR | 344.661 | 255.298 | +T4 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 16.520 | 15.786 | +| Image Segmentation/Segformer | 16.116 | 14.205 | +| Object Detection/OwlViT | 53.634 | 51.105 | +| Image Classification/BeiT | 16.464 | 15.710 | +| Object Detection/DETR | 73.100 | 53.99 | +| Image Classification/ConvNeXT | 32.932 | 30.845 | +| Image Classification/ResNet | 6.031 | 4.321 | +| Image Segmentation/Mask2former | 79.192 | 66.815 | +| Image Segmentation/Maskformer | 200.026 | 188.268 | +| Image Segmentation/MobileNet | 18.908 | 11.997 | +| Object Detection/Resnet-101 | 106.622 | 82.566 | +| Object Detection/Conditional-DETR | 77.594 | 56.984 | +T4 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 43.653 | 43.626 | +| Image Segmentation/Segformer | 45.327 | 42.445 | +| Image Classification/BeiT | 52.007 | 51.354 | +| Object Detection/DETR | 277.850 | 268.003 | +| Image Classification/ConvNeXT | 119.259 | 105.580 | +| Image Classification/ResNet | 13.039 | 11.388 | +| Image Segmentation/Mask2former | 201.540 | 184.670 | +| Image Segmentation/Maskformer | 764.052 | 711.280 | +| Image Segmentation/MobileNet | 74.289 | 48.677 | +| Object Detection/Resnet-101 | 421.859 | 357.614 | +| Object Detection/Conditional-DETR | 289.002 | 226.945 | +T4 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 163.914 | 160.907 | +| Image Segmentation/Segformer | 192.412 | 163.620 | +| Image Classification/BeiT | 188.978 | 187.976 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 422.886 | 388.078 | +| Image Classification/ResNet | 44.114 | 37.604 | +| Image Segmentation/Mask2former | 756.337 | 695.291 | +| Image Segmentation/Maskformer | 2842.940 | 2656.88 | +| Image Segmentation/MobileNet | 299.003 | 201.942 | +| Object Detection/Resnet-101 | 1619.505 | 1262.758 | +| Object Detection/Conditional-DETR | 1137.513 | 897.390| +PyTorch Nightly +We also benchmarked on PyTorch nightly (2.1.0dev, find the wheel here) and observed improvement in latency both for uncompiled and compiled models. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_17.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be524d2373f840b06772e2639399729546a7c800 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_17.txt @@ -0,0 +1,29 @@ +A100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 12.462 | 6.954 | +| Image Classification/BeiT | 4 | 14.109 | 12.851 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 30.484 | 15.221 | +| Object Detection/DETR | 4 | 46.816 | 30.942 | +| Object Detection/DETR | 16 | 163.749 | 163.706 | +T4 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 14.408 | 14.052 | +| Image Classification/BeiT | 4 | 47.381 | 46.604 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 68.382 | 53.481 | +| Object Detection/DETR | 4 | 269.615 | 204.785 | +| Object Detection/DETR | 16 | OOM | OOM | +V100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 13.477 | 7.926 | +| Image Classification/BeiT | 4 | 15.103 | 14.378 | +| Image Classification/BeiT | 16 | 52.517 | 51.691 | +| Object Detection/DETR | Unbatched | 28.706 | 19.077 | +| Object Detection/DETR | 4 | 88.402 | 62.949| +| Object Detection/DETR | 16 | OOM | OOM | +Reduce Overhead +We benchmarked reduce-overhead compilation mode for A100 and T4 in Nightly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_18.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ed1d82c694cfa3765b2ca7434d09bb1e8d8a3f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_18.txt @@ -0,0 +1,22 @@ +A100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 11.758 | 7.335 | +| Image Classification/ConvNeXT | 4 | 23.171 | 21.490 | +| Image Classification/ResNet | Unbatched | 7.435 | 3.801 | +| Image Classification/ResNet | 4 | 7.261 | 2.187 | +| Object Detection/Conditional-DETR | Unbatched | 32.823 | 11.627 | +| Object Detection/Conditional-DETR | 4 | 50.622 | 33.831 | +| Image Segmentation/MobileNet | Unbatched | 9.869 | 4.244 | +| Image Segmentation/MobileNet | 4 | 14.385 | 7.946 | +T4 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 32.137 | 31.84 | +| Image Classification/ConvNeXT | 4 | 120.944 | 110.209 | +| Image Classification/ResNet | Unbatched | 9.761 | 7.698 | +| Image Classification/ResNet | 4 | 15.215 | 13.871 | +| Object Detection/Conditional-DETR | Unbatched | 72.150 | 57.660 | +| Object Detection/Conditional-DETR | 4 | 301.494 | 247.543 | +| Image Segmentation/MobileNet | Unbatched | 22.266 | 19.339 | +| Image Segmentation/MobileNet | 4 | 78.311 | 50.983 | \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_7.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_7.txt new file mode 100644 index 
0000000000000000000000000000000000000000..07066ff9e037aef8d24e5e1fc3695222fba9ddc6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_7.txt @@ -0,0 +1 @@ +In this guide, we used the default mode. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_8.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e8ea0d6388d834909d24f3cc664d7c0570ffd2f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_8.txt @@ -0,0 +1 @@ +You can learn more about it here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_torch_compile/chunk_9.txt b/chunked/content_aware_chunking/_perf_torch_compile/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..499274be601623aa6e82db9ca477e3bfed13be4e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_torch_compile/chunk_9.txt @@ -0,0 +1 @@ +We benchmarked torch.compile with different computer vision models, tasks, types of hardware, and batch sizes on torch version 2.0.1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_10.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cedec3795429275a6d663f03f6a0c3f8e120cd7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_10.txt @@ -0,0 +1,2 @@ +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +You can check the latest versions in ipex-whl-stable-cpu if needed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_11.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fecbc48482b633374d5d8eee37708d75be36f78 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_11.txt @@ -0,0 +1 @@ +Check more approaches for IPEX installation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_12.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..052fd4e10aaf8ff1232a98287bcd12a70ee7a0e1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_12.txt @@ -0,0 +1,2 @@ +Usage in Trainer +To enable auto mixed precision with IPEX in Trainer, users should add use_ipex, bf16 and no_cuda in training command arguments. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_13.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ff12c7020bb874828d9dd312d0272cea05bf86 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_13.txt @@ -0,0 +1,29 @@ +Take an example of the use cases on Transformers question-answering + +Training with IPEX using BF16 auto mixed precision on CPU: + + python run_qa.py \ +--model_name_or_path google-bert/bert-base-uncased \ +--dataset_name squad \ +--do_train \ +--do_eval \ +--per_device_train_batch_size 12 \ +--learning_rate 3e-5 \ +--num_train_epochs 2 \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/debug_squad/ \ +--use_ipex \ +--bf16 \ +--use_cpu +If you want to enable use_ipex and bf16 in your script, add these parameters to TrainingArguments like this: +diff +training_args = TrainingArguments( + output_dir=args.output_path, ++ bf16=True, ++ use_ipex=True, ++ use_cpu=True, + **kwargs +) +Practice example +Blog: Accelerating PyTorch Transformers with Intel Sapphire Rapids \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_4.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..495bc1c1ba43b3c14fe1568a494e91891c04a788 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_4.txt @@ -0,0 +1 @@ +However, CPUs with only AVX2 (e.g., AMD or older Intel CPUs) are not guaranteed to have better performance under IPEX. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_5.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d869cd4f02404cf38d7fad9dcfd9efc978ab90b2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_5.txt @@ -0,0 +1 @@ +Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_6.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd66b618187a8e3ad3b098ea2152da1051a07f78 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_6.txt @@ -0,0 +1 @@ +AMP support for bf16 on CPUs and bf16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_7.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..88db5d5c2a500f6e164bae9275261beef5aa57c3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_7.txt @@ -0,0 +1 @@ +You can get better performance and user experience with IPEX AMP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_8.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5214c9dafecbd9ee3a2b7cc0aebfb0e98e6f0ad6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_8.txt @@ -0,0 +1 @@ +Check more detailed information for Auto Mixed Precision. 
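Outside of the Trainer, the same IPEX plus bf16 AMP combination can be applied to a plain PyTorch training step. The sketch below is only an illustration under assumed inputs (a toy model and random data); the relevant pieces are ipex.optimize and the CPU autocast context.

```python
import torch
import intel_extension_for_pytorch as ipex

model = torch.nn.Linear(128, 2)                            # stand-in for a real model
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
criterion = torch.nn.CrossEntropyLoss()

# Apply IPEX operator optimizations and prepare the model/optimizer for bf16
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

inputs = torch.randn(8, 128)
labels = torch.randint(0, 2, (8,))

# AMP on the CPU backend: eligible ops run in bfloat16 inside the autocast region
with torch.cpu.amp.autocast(dtype=torch.bfloat16):
    loss = criterion(model(inputs), labels)
loss.backward()
optimizer.step()
optimizer.zero_grad()
```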
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu/chunk_9.txt b/chunked/content_aware_chunking/_perf_train_cpu/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..661b22aa14cf043a61cde7998561bbf20cdf9d77 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu/chunk_9.txt @@ -0,0 +1,9 @@ +IPEX installation: +IPEX release is following PyTorch, to install via pip: +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 2.1.x | 2.1.100+cpu | +| 2.0.x | 2.0.100+cpu | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +Please run pip list | grep torch to get your pytorch_version, so you can get the IPEX version_name. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_15.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aa0e92aabc924b564b1c820020cbc9f53377902 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_15.txt @@ -0,0 +1,2 @@ +Usage in Trainer +To enable multi CPU distributed training in the Trainer with the ccl backend, users should add --ddp_backend ccl in the command arguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_16.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd3d3248d3001114a5486d2cea931ba45530861 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_16.txt @@ -0,0 +1,2 @@ +Let's see an example with the question-answering example +The following command enables training with 2 processes on one Xeon node, with one process running per one socket. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_17.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d19a98019cd146232c62dd7c1e452edd65df516e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_17.txt @@ -0,0 +1 @@ +The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_18.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b652ece8cfe4d5946fa13aeaa9e196267ba493b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_18.txt @@ -0,0 +1,19 @@ +shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path google-bert/bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_19.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d19a98019cd146232c62dd7c1e452edd65df516e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_19.txt @@ -0,0 +1 @@ +The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_20.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..53e300bd3b3149ea634acaa807f048f86d15041d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_20.txt @@ -0,0 +1 @@ +In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_21.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f018ef0751fdab11e14166c7fa553e9120d9e3b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_21.txt @@ -0,0 +1,28 @@ +shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +Now, run the following command in node0 and 4DDP will be enabled in node0 and node1 with BF16 auto mixed precision: +shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path google-bert/bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +Usage with Kubernetes +The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the +Kubeflow PyTorchJob training operator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_22.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..21816c1d820b0b6cc3e013f3f3ca80c426e3e538 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_22.txt @@ -0,0 +1,6 @@ +Setup +This example assumes that you have: +* Access to a Kubernetes cluster with Kubeflow installed +* kubectl installed and configured to access the Kubernetes cluster +* A Persistent Volume Claim (PVC) that can be used + to store datasets and model files. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_23.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..393c6aa906cd0f538433d1eb29122bd892907d59 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_23.txt @@ -0,0 +1,2 @@ +There are multiple options for setting up the PVC including using an NFS + storage class or a cloud storage bucket. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_24.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed49155ef07781568135f918dade2c828a593c95 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_24.txt @@ -0,0 +1 @@ +* A Docker container that includes your model training script and all the dependencies needed to run the script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_25.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e79f8d1f1adc8811fa15133127ffabf1aded002 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_25.txt @@ -0,0 +1,3 @@ +For + distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel + oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_26.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1ff39ffcd71ccc774399a532ac7a3273b2922b8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_26.txt @@ -0,0 +1,12 @@ +The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then +extracts a Transformers release to the /workspace directory, so that the example scripts are included in the image: +```dockerfile +FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9 +WORKDIR /workspace +Download and extract the transformers code +ARG HF_TRANSFORMERS_VER="4.35.2" +RUN mkdir transformers && \ + curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf - + +The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the +PyTorchJob to the cluster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_27.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1411c9784863c8370ea0fbed771c2ae21aa80a0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_27.txt @@ -0,0 +1,3 @@ +PyTorchJob Specification File +The Kubeflow PyTorchJob is used to run the distributed +training job on the cluster. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_28.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d6a5bd30d2255773083f7dfc87acac505afbb92 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_28.txt @@ -0,0 +1,9 @@ +The yaml file for the PyTorchJob defines parameters such as: + * The name of the PyTorchJob + * The number of replicas (workers) + * The Python script and its parameters that will be used to run the training job + * The types of resources (node selector, memory, and CPU) needed for each worker + * The image/tag for the Docker container to use + * Environment variables + * A volume mount for the PVC +The volume mount defines a path where the PVC will be mounted in the container for each worker pod. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_29.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e88254897272896ae0150e22a9d69f3d6134a00 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_29.txt @@ -0,0 +1,2 @@ +This location can be +used for the dataset, checkpoint files, and the saved model after training completes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_30.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a43cc5a526929fd94c5f49eef832c4127243e8d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_30.txt @@ -0,0 +1,2 @@ +The snippet below is an example of a yaml file for a PyTorchJob with 4 workers running the +question-answering example. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_31.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbf812415ca008eaa3f49c177cc724d4185ff8ce --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_31.txt @@ -0,0 +1,84 @@ +yaml +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +metadata: + name: transformers-pytorchjob + namespace: kubeflow +spec: + elasticPolicy: + rdzvBackend: c10d + minReplicas: 1 + maxReplicas: 4 + maxRestarts: 10 + pytorchReplicaSpecs: + Worker: + replicas: 4 # The number of worker pods + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: : # Specify the docker image to use for the worker pods + imagePullPolicy: IfNotPresent + command: + - torchrun + - /workspace/transformers/examples/pytorch/question-answering/run_qa.py + - --model_name_or_path + - "google-bert/bert-large-uncased" + - --dataset_name + - "squad" + - --do_train + - --do_eval + - --per_device_train_batch_size + - "12" + - --learning_rate + - "3e-5" + - --num_train_epochs + - "2" + - --max_seq_length + - "384" + - --doc_stride + - "128" + - --output_dir + - "/tmp/pvc-mount/output" + - --no_cuda + - --ddp_backend + - "ccl" + - --use_ipex + - --bf16 # Specify --bf16 if your hardware supports bfloat16 + env: + - name: LD_PRELOAD + value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so" + - name: TRANSFORMERS_CACHE + value: "/tmp/pvc-mount/transformers_cache" + - name: HF_DATASETS_CACHE + value: "/tmp/pvc-mount/hf_datasets_cache" + - name: LOGLEVEL + value: "INFO" + - name: CCL_WORKER_COUNT + value: "1" + - name: OMP_NUM_THREADS # Can be tuned for optimal performance + + resources: + limits: + cpu: 200 # Update the CPU and memory limit values based on your nodes + memory: 128Gi + requests: + cpu: 200 # Update the CPU and memory request values based on your nodes + memory: 128Gi + volumeMounts: + - name: pvc-volume + mountPath: /tmp/pvc-mount + - mountPath: /dev/shm + name: dshm + restartPolicy: Never + nodeSelector: # Optionally use the node selector to specify what types of nodes to use for the workers + node-type: spr + volumes: + - name: pvc-volume + persistentVolumeClaim: + claimName: transformers-pvc + - name: dshm + emptyDir: + medium: Memory +To run this example, update the yaml based on your training script and the nodes in your cluster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_32.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d8351015521342c75d2f1a328ae0cd54a15d0f1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_32.txt @@ -0,0 +1,3 @@ +The CPU resource limits/requests in the yaml are defined in cpu units +where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical +host or a VM). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_33.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2658e6080cfff74fe5b8b1646fc58184edba278 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_33.txt @@ -0,0 +1,2 @@ +The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of +available CPU/memory capacity on a single machine. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_34.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..082ee4bbdacd001a7d3fa8ddd2838362e2e10290 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_34.txt @@ -0,0 +1,2 @@ +It is usually a good idea to not use the entire machine's capacity in +order to leave some resources for the kubelet and OS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_35.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4583b077346e94e33a725e5eef2b6884963868b4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_35.txt @@ -0,0 +1,3 @@ +In order to get "guaranteed" +quality of service for the worker pods, +set the same CPU and memory amounts for both the resource limits and requests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_36.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..62a5c6ad25e9b461b71457f8b5b4dac45f555828 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_36.txt @@ -0,0 +1,6 @@ +Deploy +After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed +to the cluster using: + +kubectl create -f pytorchjob.yaml +The kubectl get pods -n kubeflow command can then be used to list the pods in the kubeflow namespace. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_37.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d9165035645bb459e9d359a39f62654e8f288f8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_37.txt @@ -0,0 +1,2 @@ +You should see +the worker pods for the PyTorchJob that was just deployed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_38.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..04e7f5b14aa79d630ede11229192ba54af564a63 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_38.txt @@ -0,0 +1,2 @@ +At first, they will probably have a status of "Pending" as +the containers get pulled and created, then the status should change to "Running". 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_39.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a3dc0966b77c8fbc00c424d6da93efbeb28d4a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_39.txt @@ -0,0 +1,8 @@ +NAME READY STATUS RESTARTS AGE + +transformers-pytorchjob-worker-0 1/1 Running 0 7m37s +transformers-pytorchjob-worker-1 1/1 Running 0 7m37s +transformers-pytorchjob-worker-2 1/1 Running 0 7m37s +transformers-pytorchjob-worker-3 1/1 Running 0 7m37s + +The logs for worker can be viewed using kubectl logs -n kubeflow . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_40.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfa73f43b017248eefec70941e63448151e5ff7d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_40.txt @@ -0,0 +1,4 @@ +Add -f to stream the logs, for example: + +kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f +After the training job completes, the trained model can be copied from the PVC or storage location. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_41.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..c10f18dfa7fd55af93e5327f7750e4c03fd1fe74 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_41.txt @@ -0,0 +1,2 @@ +When you are done +with the job, the PyTorchJob resource can be deleted from the cluster using kubectl delete -f pytorchjob.yaml. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_42.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..460c5186d64b525a8e4c877594242c2d9599ec47 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_42.txt @@ -0,0 +1,3 @@ +Summary +This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes +cluster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_43.txt b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..c200dd4285edcc01f94f49002d928ce3d301589c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_cpu_many/chunk_43.txt @@ -0,0 +1,2 @@ +Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training +performance, and can be used as a template to run your own workload on multiple nodes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_100.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..632391349addb332cd549bca3210300687860cdd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_100.txt @@ -0,0 +1,2 @@ +You can see that there's a forward path of 4 pipe stages (F0, F1, F2 and F3) followed by +a backward path in reverse order (B3, B2, B1, and B0). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_101.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e049ee099ff0bc3c5b85e4b0be5f6de66c19c88 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_101.txt @@ -0,0 +1,2 @@ +PP introduces a new hyperparameter to tune - chunks, which determines how many data chunks are sent in a sequence +through the same pipe stage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_102.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f7a1b9ac01c05ba042344797be616260d4ebc89 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_102.txt @@ -0,0 +1 @@ +For example, in the bottom diagram you can see chunks=4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_103.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..b45dcf391a4b27db480f0c4c96a3fd67d0d064fa --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_103.txt @@ -0,0 +1,2 @@ +GPU0 performs the same +forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to complete their work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_104.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..643ab0b56c37c82fcc6f38865f4721f985969bbb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_104.txt @@ -0,0 +1,2 @@ +Only when the other GPUs begin to complete their work does GPU0 start to work again, doing the backward path for chunks +3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_105.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..18434bc77ae2d45b41b09892a6beb0d61c1e6cb2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_105.txt @@ -0,0 +1 @@ +Note that this is the same concept as gradient accumulation steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_106.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d433844b6f9e431b4215658d03672f9808b2dbd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_106.txt @@ -0,0 +1,2 @@ +PyTorch uses chunks, while DeepSpeed refers +to the same hyperparameter as gradient accumulation steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_107.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f81073e42b99c26687c812a2ea0f880dc0acc0a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_107.txt @@ -0,0 +1 @@ +Because of the chunks, PP introduces the notion of micro-batches (MBS). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_108.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e96da1a5ffad27bd50d46e6c19abbebd089dcc6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_108.txt @@ -0,0 +1,3 @@ +DP splits the global data batch size into +mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of +256 each (1024/4). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_109.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2178c0a46c680531279137885053663a54dadeb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_109.txt @@ -0,0 +1 @@ +And if the number of chunks (or GAS) is 32, we end up with a micro-batch size of 8 (256/32). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_110.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d3c9f784d4f23819efde615fb648ba3e8407908 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_110.txt @@ -0,0 +1,2 @@ +Each +Pipeline stage works with a single micro-batch at a time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_111.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8f0791738474c1b03c4052f6a79132657a0e2d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_111.txt @@ -0,0 +1,2 @@ +To calculate the global batch size of the DP + PP setup, +use the formula: mbs * chunks * dp_degree (8 * 32 * 4 = 1024). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_112.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f3325f2a04e7fdd5af144c742a7c44c7ffdebed --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_112.txt @@ -0,0 +1 @@ +With chunks=1 you end up with the naive MP, which is inefficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_113.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f9068284fe66ab86e4af2a96d765e955a80504a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_113.txt @@ -0,0 +1,2 @@ +With a large chunks value you end up with +tiny micro-batch sizes, which is also inefficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_114.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..11679431703dc7e2c1c4b2733230573a7e5336af --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_114.txt @@ -0,0 +1,2 @@ +For this reason, we encourage you to experiment with the chunks value to +find the one that leads to the most efficient GPU utilization. 
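The batch-size bookkeeping above is easy to sanity-check in a few lines, using the numbers from the example (DP degree 4, chunks/GAS 32, global batch size 1024):

```python
global_batch_size = 1024
dp_degree = 4      # number of data-parallel replicas
chunks = 32        # a.k.a. gradient accumulation steps (GAS)

mini_batch_size = global_batch_size // dp_degree   # 1024 / 4 = 256 per DP rank
micro_batch_size = mini_batch_size // chunks       # 256 / 32 = 8 per pipeline step

# The DP + PP formula from above, applied in reverse:
assert micro_batch_size * chunks * dp_degree == global_batch_size
print(mini_batch_size, micro_batch_size)           # 256 8
```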
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_115.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..af86a3896fef55caf26cd261d22723078b60a661 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_115.txt @@ -0,0 +1,2 @@ +You may notice a bubble of "dead" time on the diagram that can't be parallelized because the last forward stage +has to wait for backward to complete the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_116.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..6492a09bdd5d1eaf540b351b5ff0bc62f47a296e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_116.txt @@ -0,0 +1,2 @@ +The purpose of finding the best value for chunks is to enable a high +concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_117.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..98fa3e6fae1c85f6048888f727e823dae9b6e56d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_117.txt @@ -0,0 +1,6 @@ +Pipeline API solutions have been implemented in: +- PyTorch +- DeepSpeed +- Megatron-LM +These come with some shortcomings: +- They have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a nn.Sequential sequence of the same, which may require changes to the design of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_118.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c48cfd845526ba2a53414d7a68c909f2a60941e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_118.txt @@ -0,0 +1 @@ +- Currently the Pipeline API is very restricted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_119.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..237386af3d6b3e38429bff353a6b135347812415 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_119.txt @@ -0,0 +1 @@ +If you had a bunch of Python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_120.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..87549034aa819859e4ca36dac51c01be08e729eb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_120.txt @@ -0,0 +1 @@ +Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_121.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..fab3a59ddcfd85fb17fd84588fbd833cf40dadf9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_121.txt @@ -0,0 +1 @@ +These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_122.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..45c96edaebcb888ad92a004a0de248cb06756900 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_122.txt @@ -0,0 +1,2 @@ +Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693 +- Conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_123.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b5d9a3898df9009e893a70a876eb5998d9d077 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_123.txt @@ -0,0 +1 @@ +- They have to arrange each layer so that the output of one layer becomes an input to the other layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_124.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b7bcb6042d17c58933189a3c90f187b8c3e0a5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_124.txt @@ -0,0 +1,5 @@ +More recent solutions include: +- Varuna +- Sagemaker +We have not experimented with Varuna and SageMaker but their papers report that they have overcome the list of problems +mentioned above and that they require smaller changes to the user's model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_125.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..87abaa3b836ea24201f66df2261d21a043d296ca --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_125.txt @@ -0,0 +1,2 @@ +Implementations: +- PyTorch (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_126.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..268446418d2df8bb51f66ed8b96281e876937bea --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_126.txt @@ -0,0 +1,3 @@ +Some examples +- DeepSpeed +- Megatron-LM has an internal implementation - no API. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_127.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..48667b5cc74d548463c22a4663f1df9a0dfc646e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_127.txt @@ -0,0 +1,2 @@ +- Varuna +- SageMaker - this is a proprietary solution that can only be used on AWS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_128.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..24e24cdc667e9713c1b8b0984bbe82c937268b9c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_128.txt @@ -0,0 +1 @@ +- OSLO - this is implemented based on the Hugging Face Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_129.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..28de7b60e544f56990b20c540da42bce75a58a4e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_129.txt @@ -0,0 +1 @@ +🤗 Transformers status: as of this writing none of the models supports full-PP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_130.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..f345aafacdb19466c3d6cd36f644835766c18637 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_130.txt @@ -0,0 +1 @@ +GPT2 and T5 models have naive MP support. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_131.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0a443f66fea28aef5b2710d07111c31e3fb2093 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_131.txt @@ -0,0 +1 @@ +The main obstacle is being unable to convert the models to nn.Sequential and have all the inputs to be Tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_132.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..717e091b151e461e52e3f0f1edfce49bfa858fd6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_132.txt @@ -0,0 +1,2 @@ +This +is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_133.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..0effbc6afe2b3f92b1d3b019d4f54fb81300c7c4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_133.txt @@ -0,0 +1,5 @@ +DeepSpeed and Megatron-LM integrations are available in 🤗 Accelerate +Other approaches: +DeepSpeed, Varuna and SageMaker use the concept of an Interleaved Pipeline + +Here the bubble (idle time) is further minimized by prioritizing backward passes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_134.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..3219acec8d198305fd92d49894fafa6d9f334867 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_134.txt @@ -0,0 +1,2 @@ +Varuna further attempts to improve the +schedule by using simulations to discover the most efficient scheduling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_135.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a16ca543532b19ef491904c6b54683538477c51 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_135.txt @@ -0,0 +1 @@ +OSLO has a pipeline parallelism implementation based on the Transformers, without nn.Sequential conversion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_136.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..9168173b8d3f8e81c1e0ac569e383aa64cafdd7c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_136.txt @@ -0,0 +1,2 @@ +Tensor Parallelism +In Tensor Parallelism, each GPU processes a slice of a tensor and only aggregates the full tensor for operations requiring it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_137.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6290ed2852dff0d4b3cb05c72fffc1946100ead --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_137.txt @@ -0,0 +1,2 @@ +To describe this method, this section of the guide relies on the concepts and diagrams from the Megatron-LM +paper: Efficient Large-Scale Language Model Training on GPU Clusters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_138.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3cdcb2982f99452c82b8807e8e2631d677a0391 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_138.txt @@ -0,0 +1 @@ +The main building block of any transformer is a fully connected nn.Linear followed by a nonlinear activation GeLU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_139.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..0816a8872d2ed9d831a51ab1b02b784c600359f5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_139.txt @@ -0,0 +1,2 @@ +The dot-product part of it, following the Megatron paper's notation, can be written as Y = GeLU(XA), where X is +an input vector, Y is the output vector, and A is the weight matrix. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_140.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed2baa6ee9f2a7f785eb523019dcf4b292653bca --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_140.txt @@ -0,0 +1,7 @@ +If we look at the computation in matrix form, you can see how the matrix multiplication can be split between multiple GPUs: + +If we split the weight matrix A column-wise across N GPUs and perform matrix multiplications XA_1 through XA_n in parallel, +then we will end up with N output vectors Y_1, Y_2, ..., Y_n which can be fed into GeLU independently: + +Using this principle, we can update a multi-layer perceptron of arbitrary depth, without the need for any synchronization +between GPUs until the very end, where we need to reconstruct the output vector from shards. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_141.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..48b20d105b95b804c2fc7a1b7bdd1e3cced6edd1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_141.txt @@ -0,0 +1,5 @@ +The Megatron-LM paper authors +provide a helpful illustration for that: + +Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having +multiple independent heads! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_142.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5a4dc5d7022de1395bcc3b7e60a9357b2b9d4f0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_142.txt @@ -0,0 +1 @@ +Special considerations: TP requires a very fast network, and therefore it's not advisable to do TP across more than one node. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_143.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..00684e31a0761f525844f16a184a40ad99ca639a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_143.txt @@ -0,0 +1 @@ +Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_144.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ae23c6a4ef6da23564012a54e08646c03955cb0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_144.txt @@ -0,0 +1,2 @@ +If you need a TP degree of 8, you need to use +nodes that have at least 8 GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_145.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..a68041cfae0ecbdf5b2687dfa6067c85d2dedae2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_145.txt @@ -0,0 +1 @@ +This section is based on the original much more detailed TP overview. 
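The column-wise split described above can be checked numerically on a single device; the sketch below only illustrates the math (the shapes and the shard count are arbitrary choices), not a multi-GPU implementation:

```python
import torch

torch.manual_seed(0)
X = torch.randn(4, 64)      # a batch of input vectors
A = torch.randn(64, 128)    # the weight matrix of the Linear layer
gelu = torch.nn.GELU()

# Reference: the unsharded computation Y = GeLU(XA)
Y = gelu(X @ A)

# Tensor parallelism: split A column-wise into N shards (one per "GPU"),
# apply GeLU to each partial result independently, then concatenate the shards.
N = 4
A_shards = torch.chunk(A, N, dim=1)                 # A_1, A_2, ..., A_N
Y_shards = [gelu(X @ A_i) for A_i in A_shards]      # no synchronization needed here
Y_tp = torch.cat(Y_shards, dim=1)                   # reconstruct the output from shards

assert torch.allclose(Y, Y_tp, atol=1e-6)
```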
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_146.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..c301387e8addca430bcbb5e4c84a3028556f5644 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_146.txt @@ -0,0 +1 @@ +by @anton-l. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_147.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..39378518d2575920280020fba079df0de5ebf29c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_147.txt @@ -0,0 +1,6 @@ +Alternative names: +- DeepSpeed calls it tensor slicing +Implementations: +- Megatron-LM has an internal implementation, as it's very model-specific +- parallelformers (only inference at the moment) +- SageMaker - this is a proprietary solution that can only be used on AWS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_148.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..290dde3e114f00a00bdef5b5ff2cb3dccbf9c945 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_148.txt @@ -0,0 +1 @@ +- OSLO has the tensor parallelism implementation based on the Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_149.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd9980cd0b0dd05f22f5d7e37f8e1288ce7b6a7c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_149.txt @@ -0,0 +1 @@ +SageMaker combines TP with DP for a more efficient processing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_150.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba4a5fc916e98bd6acccf4327580a5ddacef9edc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_150.txt @@ -0,0 +1,3 @@ +🤗 Transformers status: +- core: not yet implemented in the core +- but if you want inference parallelformers provides this support for most of our models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_151.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..6214910f0e1de43f782536ca8785b33fdd160153 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_151.txt @@ -0,0 +1 @@ +So until this is implemented in the core you can use theirs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_152.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe1f1d28d74848bd619e3f546cafbb6249669430 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_152.txt @@ -0,0 +1 @@ +And hopefully training mode will be supported too. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_153.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..716d54c784505e985ac281179742423b968c3a6e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_153.txt @@ -0,0 +1,2 @@ +- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more here +🤗 Accelerate integrates with TP from Megatron-LM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_154.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..c851881b7417109acb8e906e60c65367bf7ac87a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_154.txt @@ -0,0 +1,3 @@ +Data Parallelism + Pipeline Parallelism +The following diagram from the DeepSpeed pipeline tutorial demonstrates +how one can combine DP with PP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_155.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..16a3242333074e26db2420a23a3ec5e994474e77 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_155.txt @@ -0,0 +1 @@ +Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_156.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cf5bfde9d0e0f61648a770cdefb7352b8573e4f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_156.txt @@ -0,0 +1,2 @@ +To DP there is just GPUs 0 +and 1 where it feeds data as if there were just 2 GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_157.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..2326b66024593e96db8dc7c9b085a6377e9038e3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_157.txt @@ -0,0 +1 @@ +GPU0 "secretly" offloads some of its load to GPU2 using PP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_158.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ee2bdb4fe818cd027cb3e77e83d65e280155df3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_158.txt @@ -0,0 +1 @@ +And GPU1 does the same by enlisting GPU3 to its aid. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_159.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ee634bea2b3fdca4a3474e7f14cd8611c34a8d2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_159.txt @@ -0,0 +1 @@ +Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs. 
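As a concrete, CPU-runnable sketch of that 4-GPU topology, the snippet below builds process groups so that DP only spans ranks 0 and 1, while ranks 2 and 3 are reachable only through the pipeline groups. The file name and the gloo backend are assumptions for illustration; it would be launched with torchrun --nproc_per_node=4 dp_pp_groups.py.

```python
import torch.distributed as dist

dist.init_process_group(backend="gloo")  # gloo so the sketch runs without GPUs
rank = dist.get_rank()

# Every process must create every group, in the same order.
dp_group = dist.new_group(ranks=[0, 1])      # what DP "sees": just ranks 0 and 1
pp_group_0 = dist.new_group(ranks=[0, 2])    # rank 0 offloads its second stage to rank 2
pp_group_1 = dist.new_group(ranks=[1, 3])    # rank 1 does the same with rank 3

if rank in (0, 1):
    print(f"rank {rank}: first pipeline stage, visible to DP")
else:
    print(f"rank {rank}: second pipeline stage of replica {rank - 2}, hidden from DP")

dist.destroy_process_group()
```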
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_160.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..b77c98a8c68efe40396445d54c6810c3e05a3cf0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_160.txt @@ -0,0 +1,9 @@ +Implementations: +- DeepSpeed +- Megatron-LM +- Varuna +- SageMaker +- OSLO +🤗 Transformers status: not yet implemented +Data Parallelism + Pipeline Parallelism + Tensor Parallelism +To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_161.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..19a9ef28c29da4c15b73c587d85421bb79b4075f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_161.txt @@ -0,0 +1 @@ +This can be seen in the following diagram. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_162.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..d80479659b326b8f07f14b0d67d373ee1340061e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_162.txt @@ -0,0 +1 @@ +This diagram is from a blog post 3D parallelism: Scaling to trillion-parameter models, which is a good read as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_163.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bbcedf99f4ea58f06ae435cd779971ea4934f9b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_163.txt @@ -0,0 +1 @@ +Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_164.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ae537cb3e1d990185cecc6388b303fb7002be60 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_164.txt @@ -0,0 +1,2 @@ +Implementations: +- DeepSpeed - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_165.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ce516abe9b05be64e30586cb4934c4b9ed56fef --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_165.txt @@ -0,0 +1,5 @@ +- Megatron-LM +- Varuna +- SageMaker +- OSLO +🤗 Transformers status: not yet implemented, since we have no PP and TP. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_166.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..b90be114fe15daa477d1671af666dcdbcbf17217 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_166.txt @@ -0,0 +1,2 @@ +ZeRO Data Parallelism + Pipeline Parallelism + Tensor Parallelism +One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_167.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..583b3fdb67369d816ad4012f07305fae122359b5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_167.txt @@ -0,0 +1,2 @@ +It has already been +discussed in ZeRO Data Parallelism. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_168.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..36ad05591f1ba559ea04940032f2e456fe53fea8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_168.txt @@ -0,0 +1 @@ +Normally it's a standalone feature that doesn't require PP or TP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_169.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b6c6a129fb1a8552e0cb12e838aa662bae3207d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_169.txt @@ -0,0 +1 @@ +But it can be combined with PP and TP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_170.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..31e5e900c1e38129bd60247e5fcb8de2d20f0c86 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_170.txt @@ -0,0 +1 @@ +When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_171.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..634bc8295b23410e5bb8ba27a4f36bd4f8a559af --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_171.txt @@ -0,0 +1,2 @@ +While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have negative +performance impacts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_172.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc4d366710eba949acf88d7b205ac68dcc972ded --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_172.txt @@ -0,0 +1,2 @@ +There would need to be an additional reduce-scatter collective for every micro-batch to aggregate +the gradients before sharding, which adds a potentially significant communication overhead. 
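As a rough, purely illustrative back-of-the-envelope calculation of that overhead (the parameter count, fp16 gradients and micro-batch count below are assumed values, not measurements):
python
# Rough, illustrative arithmetic: gradient-reduction traffic per optimizer step.
params = 10_000_000_000   # assumed 10B-parameter model
bytes_per_grad = 2        # assumed fp16 gradients
n_microbatches = 32       # assumed micro-batches per optimizer step in the PP schedule

grad_bytes = params * bytes_per_grad

# ZeRO stage 1 + PP: gradients are accumulated locally and reduced once per step.
once_per_step = grad_bytes
# ZeRO stage 2 + PP: an extra reduce-scatter is needed for every micro-batch.
per_microbatch = grad_bytes * n_microbatches

print(f"reduce once per step  : {once_per_step / 1e9:.0f} GB of gradient traffic")
print(f"reduce per micro-batch: {per_microbatch / 1e9:.0f} GB of gradient traffic")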
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_173.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..3249d1e5cde5ccada02ea1d5f304bef1029e9a27 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_173.txt @@ -0,0 +1,3 @@ +By nature of Pipeline Parallelism, +small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with +minimizing the Pipeline bubble (number of micro-batches). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_174.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..711d3d7ee9d4253b8b2e473094dcef2c271db5a8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_174.txt @@ -0,0 +1 @@ +Therefore those communication costs are going to impact the performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_175.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe00b5f3eb65d235b36832efb478bfa9c6a880a3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_175.txt @@ -0,0 +1 @@ +In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_176.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..318133e9f334f2e42dc362a2c92bd22a8de55bd8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_176.txt @@ -0,0 +1,2 @@ +PP already +reduces gradient size by 1/PP, and so gradient sharding savings on top of that are less significant than pure DP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_177.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..552b3526ee8f091dc5b9120a06fe4275d08a4c77 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_177.txt @@ -0,0 +1 @@ +ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_178.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8bf4af46d5c47087277b716717d04725216bb8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_178.txt @@ -0,0 +1 @@ +And since we have ZeRO, the other benefit is ZeRO-Offload. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_179.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..47f5c884335647c366747b99260de0a92fc2703d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_179.txt @@ -0,0 +1 @@ +Since this is stage 1 optimizer states can be offloaded to CPU. 
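As a sketch, the relevant fragment of a DeepSpeed configuration for ZeRO stage 1 with optimizer-state offload might look like the following (shown here as a Python dict; whether your DeepSpeed version accepts offload_optimizer at stage 1 should be checked against its documentation):
python
import json

# Illustrative fragment only: ZeRO stage 1 with optimizer states offloaded to CPU.
ds_config = {
    "zero_optimization": {
        "stage": 1,                  # shard (and here offload) the optimizer states only
        "offload_optimizer": {
            "device": "cpu",         # keep optimizer states in CPU RAM
            "pin_memory": True,
        },
    },
}

# This dict would normally be written out as the ds_config.json passed to the launcher.
print(json.dumps(ds_config, indent=2))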
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_180.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..536bba68a6ab30290d50a720ea2f48f89d0dfb30 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_180.txt @@ -0,0 +1,2 @@ +Implementations: +- Megatron-DeepSpeed and Megatron-Deepspeed from BigScience, which is the fork of the former repo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_181.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..af5912b6e2cb9b0ca9c7d13f7fde224ec4d50452 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_181.txt @@ -0,0 +1,6 @@ +- OSLO +Important papers: + +Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model + +🤗 Transformers status: not yet implemented, since we have no PP and TP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_182.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..c604da69e14308dc44c42e473d018f6acfd8a436 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_182.txt @@ -0,0 +1,2 @@ +FlexFlow +FlexFlow also solves the parallelization problem with a slightly different approach. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_183.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..78890b33ff60862e7dfa27c1cb35480ae010d70c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_183.txt @@ -0,0 +1,2 @@ +Paper: "Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken +It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_184.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fa820b26d89107bc99704b5f373921431a98c7c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_184.txt @@ -0,0 +1,8 @@ +Sample = Data Parallelism (sample-wise parallel) +Operator = Parallelize a single operation into several sub-operations +Attribute = Data Parallelism (length-wise parallel) +Parameter = Model Parallelism (regardless of dimension - horizontal or vertical) + +Examples: +* Sample +Let's take 10 batches of sequence length 512. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_185.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..818a57b91b41f931f9b12c68a643a76aaf587e86 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_185.txt @@ -0,0 +1 @@ +If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
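A minimal sketch of that Sample split with PyTorch tensors (device placement is omitted so the snippet runs anywhere; the shapes are the ones from the example above):
python
import torch

# 10 batches of sequence length 512 (any feature dimension is omitted for brevity).
data = torch.randn(10, 512)

# Sample-dimension split across 2 devices: 10 x 512 becomes 5 x 512 on each device.
shards = torch.chunk(data, chunks=2, dim=0)
for device_id, shard in enumerate(shards):
    print(f"device {device_id}: shard of shape {tuple(shard.shape)}")  # (5, 512) each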
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_186.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..72ab1508871d94d5c54a9d33ebbb1b4b1314a61a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_186.txt @@ -0,0 +1,3 @@ +Operator + +If we perform layer normalization, we compute std first and mean second, and then we can normalize data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_187.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bbac54ec51b3f3ca5fa6d22325358a1dad7a7d6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_187.txt @@ -0,0 +1 @@ +Operator parallelism allows computing std and mean in parallel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_188.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..595d8344f22f3fbcf557d9bf1ce5883fd5e47c2b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_188.txt @@ -0,0 +1,2 @@ +So if we parallelize them by operator dimension into 2 +devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_189.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c1773b8542a532e38c031e47b89900d96666179 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_189.txt @@ -0,0 +1,3 @@ +Attribute + +We have 10 batches of 512 length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_190.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc6b473c0132523104d9c03e91961d7c43ed72f5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_190.txt @@ -0,0 +1 @@ +If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_191.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f9d93d5fd098f96bffc2bb4da541b28f7f82697 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_191.txt @@ -0,0 +1,3 @@ +Parameter + +It is similar with tensor model parallelism or naive layer-wise model parallelism. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_192.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cae43648cf09e05b7ab8b22e169a3ba078c7fba --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_192.txt @@ -0,0 +1,3 @@ +The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. 
(3) +fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which +parallelisation to use where. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_193.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebaaa7d1a5b9e407883479169967cbcd91cc5b6b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_193.txt @@ -0,0 +1,2 @@ +One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and +fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_194.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..060b2cc49d3ee81da0519d4ad938e14a866bec58 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_194.txt @@ -0,0 +1,2 @@ +So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best +strategy to utilise this specific environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_195.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..a213662588881b4cc2891eabd98cb2af500648b0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_195.txt @@ -0,0 +1,2 @@ +If you add/remove/replace any parts it'll run and re-optimize the plan +for that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_196.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..0af14339c07c35aac4d79e482e1fa143e3d35285 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_196.txt @@ -0,0 +1 @@ +And then you can train. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_197.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3f10690036cd1297535915138ac9b84db702a32 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_197.txt @@ -0,0 +1 @@ +A different setup will have its own custom optimization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_198.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..db4bbc2cc1690c752f6044197668ec9127def098 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_198.txt @@ -0,0 +1,2 @@ +🤗 Transformers status: Transformers models are FX-trace-able via transformers.utils.fx, +which is a prerequisite for FlexFlow, however, changes are required on the FlexFlow side to make it work with Transformers models. 
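Going back to the Operator example above, here is a minimal sketch of computing the two layer-norm statistics on two devices at once; it assumes two CUDA devices and falls back to CPU so it stays runnable:
python
import torch

# Pick two devices for the Operator split; fall back to CPU if two GPUs aren't available.
if torch.cuda.device_count() >= 2:
    dev0, dev1 = torch.device("cuda:0"), torch.device("cuda:1")
else:
    dev0 = dev1 = torch.device("cpu")

x = torch.randn(8, 512)

# Copy the input to both devices, then compute the two statistics concurrently.
std = x.to(dev0).std(dim=-1, keepdim=True)    # computed on the first device
mean = x.to(dev1).mean(dim=-1, keepdim=True)  # computed on the second device

# Bring the statistics back together and normalize.
normalized = (x - mean.to(x.device)) / (std.to(x.device) + 1e-5)
print(normalized.shape)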
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_199.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..e546c8f424202b1f79fee88e92b5d8f72a77a4e5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_199.txt @@ -0,0 +1,2 @@ +GPU selection +When training on multiple GPUs, you can specify the number of GPUs to use and in what order. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_200.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..01c0e6977e1bb40efba9827ab9328d54a70a749b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_200.txt @@ -0,0 +1 @@ +This can be useful for instance when you have GPUs with different computing power and want to use the faster GPU first. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_201.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..b44ef19255ce3b459c01e0f9be1d0c20f29e0ef3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_201.txt @@ -0,0 +1 @@ +The selection process works for both DistributedDataParallel and DataParallel to use only a subset of the available GPUs, and you don't need Accelerate or the DeepSpeed integration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_202.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dfe331fa47c8f211c4a2812f8dc087d5ebf3331 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_202.txt @@ -0,0 +1,4 @@ +Number of GPUs +For example, if you have 4 GPUs and you only want to use the first 2: + +Use the --nproc_per_node to select how many GPUs to use. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_203.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..8621c21cc8f2fce87181b2108c4fb58c68132c81 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_203.txt @@ -0,0 +1,3 @@ +torchrun --nproc_per_node=2 trainer-program.py + +Use --num_processes to select how many GPUs to use. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_204.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..68c8ee3ab792c5cd487ea4733abd24aa4be9747b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_204.txt @@ -0,0 +1,3 @@ +accelerate launch --num_processes 2 trainer-program.py + +Use --num_gpus to select how many GPUs to use. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_205.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..07fd34146385abecf1faab2fe0047c7780d05d1c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_205.txt @@ -0,0 +1,4 @@ +deepspeed --num_gpus 2 trainer-program.py + +Order of GPUs +Now, to select which GPUs to use and their order, you'll use the CUDA_VISIBLE_DEVICES environment variable. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_206.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..a873c4e2dcbfd84ce33abbf939a1203659312100 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_206.txt @@ -0,0 +1 @@ +It is easiest to set the environment variable in a ~/bashrc or another startup config file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_207.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..d82158748a34d4db7a2e17ca50832ffad4463f4d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_207.txt @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES is used to map which GPUs are used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_208.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d3d27269994f62a0ad7dc3cd94246101663d527 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_208.txt @@ -0,0 +1,4 @@ +For example, if you have 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2: + +CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py +Only the 2 physical GPUs (0 and 2) are "visible" to PyTorch and these are mapped to cuda:0 and cuda:1 respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_209.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b2a49e266f3f6bc300f866e45bac5a158fea83 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_209.txt @@ -0,0 +1 @@ +You can also reverse the order of the GPUs to use 2 first. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_210.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..086de8d5f4a2e511c2ac6e62c7fba45beee1a824 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_210.txt @@ -0,0 +1 @@ +Now, the mapping is cuda:1 for GPU 0 and cuda:0 for GPU 2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_211.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6eec6bacb07ab3c6c2c022531279a983c7940d5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_211.txt @@ -0,0 +1,2 @@ +CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py +You can also set the CUDA_VISIBLE_DEVICES environment variable to an empty value to create an environment without GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_212.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..5510f610ea767e99b61ca15f52a08afb66a341ed --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_212.txt @@ -0,0 +1,3 @@ +CUDA_VISIBLE_DEVICES= python trainer-program.py + +As with any environment variable, they can be exported instead of being added to the command line. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_213.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd920fc759777147b724897ac048c005ef410f0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_213.txt @@ -0,0 +1 @@ +However, this is not recommended because it can be confusing if you forget how the environment variable was setup and you end up using the wrong GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_214.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f6e07abed80bbc23d63ae3037189b247fd732e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_214.txt @@ -0,0 +1 @@ +Instead, it is common practice to set the environment variable for a specific training run on the same command line. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_215.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..000745f5f4e14769c9b7c735bb13fff73a6ac8ba --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_215.txt @@ -0,0 +1 @@ +CUDA_DEVICE_ORDER is an alternative environment variable you can use to control how the GPUs are ordered. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_216.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5ef8df5c9787340dcd953344a3ada63f9cedd74 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_216.txt @@ -0,0 +1,10 @@ +You can either order them by: + +PCIe bus ID's that matches the order of nvidia-smi and rocm-smi for NVIDIA and AMD GPUs respectively + +export CUDA_DEVICE_ORDER=PCI_BUS_ID + +GPU compute ability + +export CUDA_DEVICE_ORDER=FASTEST_FIRST +The CUDA_DEVICE_ORDER is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first. 
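As a quick sanity check of how CUDA_VISIBLE_DEVICES and CUDA_DEVICE_ORDER affect what PyTorch sees, a small sketch like this can be run; both variables must be set before CUDA is first initialized, hence before importing torch, and the values are illustrative:
python
import os

# Must be set before CUDA is initialized, i.e. before importing torch.
os.environ["CUDA_DEVICE_ORDER"] = "FASTEST_FIRST"  # enumerate the fastest card as device 0
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"         # then expose the first two in that order

import torch

print("visible devices:", torch.cuda.device_count())
for i in range(torch.cuda.device_count()):
    print(f"cuda:{i} ->", torch.cuda.get_device_name(i))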
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_217.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b68a4b83eb869f2e7c47f3fea4b136e8bf32671 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_217.txt @@ -0,0 +1 @@ +In this case, set CUDA_DEVICE_ORDER=FASTEST_FIRST to always use the newer and faster GPU first (nvidia-smi or rocm-smi still reports the GPUs in their PCIe order). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_218.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..85ae09dc209f279604b6cb3e95b366ce510a90d3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_218.txt @@ -0,0 +1 @@ +Or you could also set export CUDA_VISIBLE_DEVICES=1,0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_62.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..aba22e1e15b8c5a0b0ab7d71d9aae0ccd853ca05 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_62.txt @@ -0,0 +1 @@ +What happens at this point? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_63.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5f6c0983746c3d4e6fbee2a2170c0854bc7fb78 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_63.txt @@ -0,0 +1 @@ +On GPU0: the x0 mini-batch requires the a0, a1, a2 parameters to do its forward path through the layer, but the GPU0 has only a0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_64.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..6db028a6da99c2cd906e74df7b790290ec8584f6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_64.txt @@ -0,0 +1 @@ +It will get a1 from GPU1 and a2 from GPU2, bringing all the pieces of the model together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_65.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..539d3562361ef65d6c27f46e34c01d78b84d835e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_65.txt @@ -0,0 +1 @@ +In parallel, GPU1 gets another mini-batch - x1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_66.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cdf4c2c1bcb3de644230dbf8bc5d199b5a44509 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_66.txt @@ -0,0 +1 @@ +GPU1 has the a1 parameter, but needs a0 and a2, so it gets those from GPU0 and GPU2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_67.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..0389b9f37bd1c3948f6ca684d8539941201d880b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_67.txt @@ -0,0 +1 @@ +Same happens to GPU2 that gets the mini-batch x2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_68.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e8f4f8ff32b8085d19d29a74225d1cc369d26c9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_68.txt @@ -0,0 +1 @@ +It gets a0 and a1 from GPU0 and GPU1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_69.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..833c72154b7dc8dad2d63ca75055de3b6011824b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_69.txt @@ -0,0 +1 @@ +This way each of the 3 GPUs gets the full tensors reconstructed and makes a forward pass with its own mini-batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_70.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3b950b81f0b5d6cb54e4b5d08aec78c9c86933c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_70.txt @@ -0,0 +1 @@ +As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_71.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d99aea92a1aa240e2a9e57634ff891ab1554e9f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_71.txt @@ -0,0 +1 @@ +The reconstruction is done efficiently via a pre-fetch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_72.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dbc0e90d162e2b71b3ff93325b74135b0e55611 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_72.txt @@ -0,0 +1 @@ +Then the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_73.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..019d317a7ffc7208e15d0389e6a9d1dadbc2d131 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_73.txt @@ -0,0 +1,2 @@ +This mechanism is similar to an efficient group backpacking strategy: person A carries the tent, person B carries the stove, +and person C carries the axe. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_74.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecfd820e791e2e8ab8e5b5c32132b6f95f3e882c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_74.txt @@ -0,0 +1,2 @@ +Each night they all share what they have with others and get from others what they don't have, +and in the morning they pack up their allocated type of gear and continue on their way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_75.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ef711f8e1e2bc0d389de73b6cf640744c19bfb7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_75.txt @@ -0,0 +1 @@ +This is what ZeRO DP/Sharded DDP is. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_76.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..442c1b5d426ded0bb74cfbcaa9232b9ac1212c55 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_76.txt @@ -0,0 +1,2 @@ +Compare this strategy to the simple one where each person has to carry their own tent, stove and axe (similar to +DataParallel (DP and DDP) in PyTorch), which would be far more inefficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_77.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..272066f7062362fe30af3fe57a97ef7e0a3a7368 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_77.txt @@ -0,0 +1 @@ +While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_78.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0aa3ea3bac6da326840baa67674492d61104e3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_78.txt @@ -0,0 +1,2 @@ +If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism +which will be discussed later. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_79.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0523e76bd8c53859e88b6259eb42d83d6dc99be --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_79.txt @@ -0,0 +1,2 @@ +This is because it partitions/shards each layer's weights, unlike vertical model parallelism +which is discussed next. 
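Here is a toy, single-process sketch of that gather-compute-drop cycle, with plain tensors standing in for the a0/a1/a2 shards and no real communication performed:
python
import torch

WORLD_SIZE = 3
full_weight = torch.randn(6, 4)                              # one layer's weight matrix
shards = list(torch.chunk(full_weight, WORLD_SIZE, dim=0))   # a0, a1, a2 - one shard per GPU

def forward_on_rank(rank: int, x: torch.Tensor) -> torch.Tensor:
    # 1. "All-gather": fetch the shards this rank is missing from the other ranks.
    gathered = torch.cat(shards, dim=0)                      # full weight, temporarily
    # 2. Forward pass for this rank's own mini-batch.
    out = x @ gathered.t()
    # 3. Drop the gathered copy - only this rank's own shard stays resident.
    del gathered
    return out

for rank in range(WORLD_SIZE):
    mini_batch = torch.randn(2, 4)                           # x0, x1, x2
    print(f"rank {rank} output:", tuple(forward_on_rank(rank, mini_batch).shape))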
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_80.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..7060ffd30fc453a5f03833b5c6e894e1f155b0a3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_80.txt @@ -0,0 +1,8 @@ +Implementations: + +DeepSpeed ZeRO-DP stages 1+2+3 +Accelerate integration +transformers integration + +From Naive Model Parallelism to Pipeline Parallelism +To explain Pipeline parallelism, we'll first look into Naive Model Parallelism (MP), also known as Vertical MP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_81.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..977bfceac8ff46b7ac93e6d61ae085b0849421d2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_81.txt @@ -0,0 +1,2 @@ +This approach +involves distributing groups of model layers across multiple GPUs by assigning specific layers to specific GPUs with .to(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_82.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a90908814d035216b35e098444b01dc726ef489 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_82.txt @@ -0,0 +1 @@ +As data flows through these layers, it is moved to the same GPU as the layer, while the other layers remain untouched. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_83.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..93913bea719e52f7b2afed8241688717bc021c55 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_83.txt @@ -0,0 +1 @@ +We refer to this Model parallelism as "Vertical" because of how models are typically visualized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_84.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..622aab1511f1c6e69f73903662f9eda9648b46d3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_84.txt @@ -0,0 +1,18 @@ +For example, the +following diagram shows an 8-layer model split vertically into two slices, placing layers 0-3 onto +GPU0 and 4-7 to GPU1: + +| Layer | | +| 0 | | +| 1 | GPU0 | +| 2 | | +| 3 | | +================ +| Layer | | +| 4 | | +| 5 | GPU1 | +| 6 | | +| 7 | | +================ + +In this example, when data moves from layer 0 to 3, it's no different from regular forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_85.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f3110610fa6d0064433c4059197dbc13005e0d1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_85.txt @@ -0,0 +1,2 @@ +However, passing data +from layer 3 to 4 requires moving it from GPU0 to GPU1, introducing a communication overhead. 
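A minimal sketch of this naive/vertical split in PyTorch, with two tiny linear blocks standing in for the two groups of layers; it assumes two GPUs and falls back to CPU so it stays runnable:
python
import torch
from torch import nn

use_two_gpus = torch.cuda.device_count() >= 2
dev0 = torch.device("cuda:0" if use_two_gpus else "cpu")
dev1 = torch.device("cuda:1" if use_two_gpus else "cpu")

class NaiveMP(nn.Module):
    def __init__(self):
        super().__init__()
        self.first_half = nn.Sequential(nn.Linear(512, 512), nn.ReLU()).to(dev0)   # "layers 0-3"
        self.second_half = nn.Sequential(nn.Linear(512, 512), nn.ReLU()).to(dev1)  # "layers 4-7"

    def forward(self, x):
        x = self.first_half(x.to(dev0))
        # The hidden states hop from GPU0 to GPU1 here - this copy is the communication overhead.
        x = self.second_half(x.to(dev1))
        return x

model = NaiveMP()
print(model(torch.randn(4, 512)).shape)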
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_86.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b338ea7355601c4d3f628bcc619efa0aa31ee6c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_86.txt @@ -0,0 +1,2 @@ +If the participating +GPUs are on the same compute node (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_87.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..efaff7860efa64accb18bd6782a8341d68b36c55 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_87.txt @@ -0,0 +1,2 @@ +same physical machine) this copying is fast, but if the GPUs are distributed +across different compute nodes (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_88.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c0edcfe6bf76e6bd2fc045e8f4bac1d38116461 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_88.txt @@ -0,0 +1 @@ +multiple machines), the communication overhead could be substantially greater. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_89.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2fb2afd7accc56ae1a077be7496b670fd885d92 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_89.txt @@ -0,0 +1 @@ +Following that, layers 4 to 7 work as they would in the original model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_90.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..01bbbb075da09126b3a0bdbb4bb34f6bdc32d3ae --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_90.txt @@ -0,0 +1,2 @@ +Upon completion of the 7th layer, there is often +a need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_91.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..1897dd5b8ea6edacfcfa6691682658ccaae2dd43 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_91.txt @@ -0,0 +1,2 @@ +Now the loss can be +computed and the optimizer can do its work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_92.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..d579be6ebe100c71a91853c1544d96a795f18619 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_92.txt @@ -0,0 +1,2 @@ +Naive Model Parallelism comes with several shortcomings: +- All but one GPU are idle at any given moment: if 4 GPUs are used, it's nearly identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_93.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..a04083f5df58efb9e6549676d13d9d607bff4cb9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_93.txt @@ -0,0 +1 @@ +- Overhead in data transfer between devices: E.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_94.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..3156a31873235dbba64b03c99e2864fbd5e26967 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_94.txt @@ -0,0 +1 @@ +4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, but a single 24GB card will complete the training faster, because it doesn't have the data copying overhead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_95.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfad266409385a534b8d9389b35d45067575374c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_95.txt @@ -0,0 +1,2 @@ +But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) +- Copying shared embeddings: Shared embeddings may need to get copied back and forth between GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_96.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d37cb004f9192b3f55c349f9a24bfade36ac621 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_96.txt @@ -0,0 +1 @@ +Now that you are familiar with how the naive approach to model parallelism works and its shortcomings, let's look at Pipeline Parallelism (PP). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_97.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc1e56e782533271807afb15fdd3b4af3d70c04 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_97.txt @@ -0,0 +1,2 @@ +PP is almost identical to a naive MP, but it solves the GPU idling problem by chunking the incoming batch into micro-batches +and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_98.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81eb5e19e936a673b0d9ae18e7a451d0d5cdd4f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_98.txt @@ -0,0 +1,5 @@ +The following illustration from the GPipe paper +shows the naive MP on the top, and PP on the bottom: + +At the bottom of the diagram, you can observe that the Pipeline Parallelism (PP) approach minimizes the number of idle +GPU zones, referred to as 'bubbles'. 
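Since the GPipe illustration itself is not reproduced here, a tiny pure-Python stand-in for the forward schedule can help visualize the bubbles; the 4 stages and 4 micro-batches are illustrative choices:
python
# Forward-only GPipe-style schedule: stage s works on micro-batch m at time step s + m.
STAGES = 4        # GPUs in the pipeline
MICROBATCHES = 4  # chunks the incoming batch was split into

for t in range(STAGES + MICROBATCHES - 1):
    row = []
    for s in range(STAGES):
        m = t - s
        row.append(f"F{m}" if 0 <= m < MICROBATCHES else "..")  # ".." marks an idle bubble
    print(f"t={t}: " + "  ".join(row))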
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_99.txt b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1846f1274592942a1c22f353837bce2e4d5682 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_many/chunk_99.txt @@ -0,0 +1,2 @@ +Both parts of the diagram show a parallelism level of degree 4, meaning that 4 GPUs +are involved in the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_100.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..68388f339816a3a9a0b99a9b4184f090d62d5828 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_100.txt @@ -0,0 +1 @@ +It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_101.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b7ef749af14d251e064e30e0cf80e97259c165 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_101.txt @@ -0,0 +1,3 @@ +Data preloading +One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it +can handle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_102.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..601bc1b57240f12747270cc8bf8a5f5fbaf0f4bd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_102.txt @@ -0,0 +1,2 @@ +By default, everything happens in the main process, and it might not be able to read the data from disk fast +enough, and thus create a bottleneck, leading to GPU under-utilization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_103.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e1bdd043853ac62e19514bc064874bb68c5cb5c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_103.txt @@ -0,0 +1,3 @@ +Configure the following arguments to reduce the bottleneck: + +DataLoader(pin_memory=True, ) - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_104.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc7f399492ff8cdc21186b14e87908d363507fc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_104.txt @@ -0,0 +1 @@ +DataLoader(num_workers=4, ) - spawn several workers to preload data faster. 
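In plain PyTorch these two flags are passed directly to the DataLoader; a minimal sketch with a dummy dataset (the dataset and batch size are placeholders):
python
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":  # guard needed because num_workers > 0 spawns subprocesses
    dataset = TensorDataset(torch.randn(1024, 16), torch.randint(0, 2, (1024,)))
    loader = DataLoader(
        dataset,
        batch_size=32,
        pin_memory=True,  # page-locked CPU memory -> faster CPU-to-GPU copies
        num_workers=4,    # preload batches in 4 background worker processes
    )
    features, labels = next(iter(loader))
    print(features.shape, labels.shape)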
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_105.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..59e16fa53ad5f99a0342518deea80db069fbb304 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_105.txt @@ -0,0 +1 @@ +During training, watch the GPU utilization stats; if it's far from 100%, experiment with increasing the number of workers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_106.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4041c2775f5397f0a8fa2c94ae482c85feefc1a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_106.txt @@ -0,0 +1 @@ +Of course, the problem could be elsewhere, so many workers won't necessarily lead to better performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_107.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..f21c37a226b34eb52057b9ed01ba37eee116efbc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_107.txt @@ -0,0 +1 @@ +When using [Trainer], the corresponding [TrainingArguments] are: dataloader_pin_memory (True by default), and dataloader_num_workers (defaults to 0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_108.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..f159fa0dc0b2f4704eb57f2015bc20b4e357b3d3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_108.txt @@ -0,0 +1,2 @@ +DeepSpeed ZeRO +DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_109.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..f195a6fa4b16bc8a7c281d2027db41773bb09891 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_109.txt @@ -0,0 +1,2 @@ +It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale +deep learning training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_110.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..47eb4d29d4cf8d82615717692203eceb0a4cd9b4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_110.txt @@ -0,0 +1,2 @@ +If your model fits onto a single GPU and you have enough space to fit a small batch size, you don't need to use DeepSpeed +as it'll only slow things down. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_111.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..16705c5925d815aa9d2812576221ff94d41a0a26 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_111.txt @@ -0,0 +1,2 @@ +However, if the model doesn't fit onto a single GPU or you can't fit a small batch, you can +leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_112.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..9891cbae78776f0bacbe32568c6785fcbbb0c8dd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_112.txt @@ -0,0 +1,6 @@ +In this case, you need to separately +install the library, then follow one of the guides to create a configuration file +and launch DeepSpeed: + +For an in-depth guide on DeepSpeed integration with [Trainer], review the corresponding documentation, specifically the +section for a single GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_113.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..428b9af6249ca3dd4f4e816375c504e2472f6bb2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_113.txt @@ -0,0 +1 @@ +Some adjustments are required to use DeepSpeed in a notebook; please take a look at the corresponding guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_114.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a2b118230c77a4758898205f17371210ea3cc1d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_114.txt @@ -0,0 +1 @@ +If you prefer to use 🤗 Accelerate, refer to 🤗 Accelerate DeepSpeed guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_115.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f78e422046df3c2020ac85d73df2f22ea37b93 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_115.txt @@ -0,0 +1,3 @@ +Using torch.compile +PyTorch 2.0 introduced a new compile function that doesn't require any modification to existing PyTorch code but can +optimize your code by adding a single line of code: model = torch.compile(model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_116.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5e68885eb31f1ab3d42b67f85e43e08b0fab301 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_116.txt @@ -0,0 +1,4 @@ +If using [Trainer], you only need to pass the torch_compile option in the [TrainingArguments]: +python +training_args = TrainingArguments(torch_compile=True, **default_args) +torch.compile uses Python's frame evaluation API to automatically create a graph from existing PyTorch programs. 
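Outside of [Trainer], the same one-liner can be applied to any model; a minimal sketch assuming PyTorch >= 2.0 and a placeholder model:
python
import torch
from torch import nn

model = nn.Sequential(nn.Linear(128, 128), nn.GELU(), nn.Linear(128, 2))

# The single added line: compilation happens lazily on the first forward call.
model = torch.compile(model)

print(model(torch.randn(8, 128)).shape)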
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_117.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f8b7854546bfa38f0123714aba668083bc43593 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_117.txt @@ -0,0 +1,2 @@ +After +capturing the graph, different backends can be deployed to lower the graph to an optimized engine. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_118.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..9eda94b608db104b4656181670f27c4f786198b7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_118.txt @@ -0,0 +1 @@ +You can find more details and benchmarks in PyTorch documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_119.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..5147d45dde75f5e922473a9a371f1dabc12ae267 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_119.txt @@ -0,0 +1 @@ +torch.compile has a growing list of backends, which can be found by calling torchdynamo.list_backends(), each with its own optional dependencies. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_120.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..98a8a9950c0aabff4b6e73f73ca8a2306217c903 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_120.txt @@ -0,0 +1 @@ +Choose which backend to use by specifying it via torch_compile_backend in the [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_121.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a8e6000e61dc2e1f08640c0a691c5949aa1910e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_121.txt @@ -0,0 +1,3 @@ +Some of the most commonly used backends are: +Debugging backends: +* dynamo.optimize("eager") - Uses PyTorch to run the extracted GraphModule. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_122.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab544edaadae56fe34ed437dae67bb994660af93 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_122.txt @@ -0,0 +1 @@ +This is quite useful in debugging TorchDynamo issues. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_123.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c17e2a36b9af89f3072aef204dfa9c125c9912 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_123.txt @@ -0,0 +1 @@ +* dynamo.optimize("aot_eager") - Uses AotAutograd with no compiler, i.e., just using PyTorch eager for the AotAutograd's extracted forward and backward graphs.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_124.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe3773f9a0995409d18fa7e5bdc7108cdc1b0415 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_124.txt @@ -0,0 +1 @@ +This is useful for debugging, and unlikely to give speedups. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_125.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..7097b272f81cd247fe5cb7694d34a13451295dd1 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_125.txt @@ -0,0 +1,3 @@ +Training & inference backends: +* dynamo.optimize("inductor") - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels Read more +* dynamo.optimize("nvfuser") - nvFuser with TorchScript. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_126.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..485996eeb46f8cffc64b88d9aab24ebade78a0e8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_126.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("aot_nvfuser") - nvFuser with AotAutograd. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_127.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..1207ffa4a4991544cf5c28ffafb7c36b5884c0f8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_127.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("aot_cudagraphs") - cudagraphs with AotAutograd. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_128.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..136888d80218406711af457df6c88c7bd0e6c1fc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_128.txt @@ -0,0 +1,3 @@ +Read more +Inference-only backends: +* dynamo.optimize("ofi") - Uses Torchscript optimize_for_inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_129.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b17697657d31c392fd5bd1a4d43984c6ed52a69 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_129.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("fx2trt") - Uses NVIDIA TensorRT for inference optimizations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_130.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc2699b612c999623071e02f13c66d8f0913f934 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_130.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("onnxrt") - Uses ONNXRT for inference on CPU/GPU. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_131.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c587dd97c1a076c03f38522758cf5dae0cfe202 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_131.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("ipex") - Uses IPEX for inference on CPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_132.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..44a7a59fd4092cbc93a291ca53c9b241438612c7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_132.txt @@ -0,0 +1,4 @@ +Read more +For an example of using torch.compile with 🤗 Transformers, check out this blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features. +Using 🤗 PEFT +Parameter-Efficient Fine-Tuning (PEFT) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_133.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..66d05d6ace0027e53674bc7e6a38905a07711bf9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_133.txt @@ -0,0 +1 @@ +As a result, the memory associated with the optimizer states and gradients is greatly reduced. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_134.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff65a1c836c3a679b50ad68119fc0620a2e10248 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_134.txt @@ -0,0 +1,5 @@ +For example, with a vanilla AdamW, the memory requirement for the optimizer state would be: +* fp32 copy of parameters: 4 bytes/param +* Momentum: 4 bytes/param +* Variance: 4 bytes/param +Suppose a model with 7B parameters and 200 million trainable parameters injected with Low-Rank Adapters (LoRA). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_135.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..8be3b2ac985f50f84d3d789277fc8e98c3b4ac83 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_135.txt @@ -0,0 +1 @@ +The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_136.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..eeb92ee8bacd414d60c298d0a079de9ff3907d7a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_136.txt @@ -0,0 +1 @@ +Adding LoRA slightly increases the memory associated with the model weights and substantially decreases the memory requirement for the optimizer state to 12 * 0.2 = 2.4GB.
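As a rough sketch of what this looks like in practice with the 🤗 PEFT library (the checkpoint and the LoRA hyperparameters r, lora_alpha, target_modules and lora_dropout below are illustrative assumptions, not recommendations):
python
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Illustrative base checkpoint; any causal LM works the same way.
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")

# Hypothetical LoRA settings - tune these for your own model and task.
lora_config = LoraConfig(r=8, lora_alpha=16, target_modules=["q_proj", "v_proj"], lora_dropout=0.05)
model = get_peft_model(model, lora_config)

# Only the small adapter matrices are trainable, which is what shrinks the optimizer state.
model.print_trainable_parameters()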
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_137.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..8827cd7d53bd45418be22c4787395a97c670d05b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_137.txt @@ -0,0 +1 @@ +Read more about PEFT and its detailed usage in the PEFT documentation or PEFT repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_138.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..d12ea538d26afc545311d675fafc3a8838c184b0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_138.txt @@ -0,0 +1,3 @@ +Using 🤗 Accelerate +With 🤗 Accelerate you can use the above methods while gaining full +control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_139.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a44e840ebdc3bb8689daa82d80bbf680bbd3fef --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_139.txt @@ -0,0 +1,28 @@ +Suppose you have combined the methods in the [TrainingArguments] like so: +py +training_args = TrainingArguments( + per_device_train_batch_size=1, + gradient_accumulation_steps=4, + gradient_checkpointing=True, + fp16=True, + **default_args, +) +The full example training loop with 🤗 Accelerate is only a handful of lines of code long: + +from accelerate import Accelerator +from torch.utils.data.dataloader import DataLoader +dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) +if training_args.gradient_checkpointing: + model.gradient_checkpointing_enable() +accelerator = Accelerator(fp16=training_args.fp16) +model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) +model.train() +for step, batch in enumerate(dataloader, start=1): + loss = model(**batch).loss + loss = loss / training_args.gradient_accumulation_steps + accelerator.backward(loss) + if step % training_args.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + +First we wrap the dataset in a DataLoader. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_140.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..5222f55009dd53c6f75941e7041fa69f64e72c35 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_140.txt @@ -0,0 +1 @@ +Then we can enable gradient checkpointing by calling the model's [~PreTrainedModel.gradient_checkpointing_enable] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_141.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..79869703de0e4d0496a294364eac6c704cd4efad --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_141.txt @@ -0,0 +1,2 @@ +When we initialize the Accelerator +we can specify if we want to use mixed precision training and it will take care of it for us in the [prepare] call. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_142.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8dfc6f22f3dba0949fc5d2acae860e6760d52c3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_142.txt @@ -0,0 +1,2 @@ +During the prepare +call the dataloader will also be distributed across workers should we use multiple GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_143.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..082aa634c63ee3d78ccd6f95576f65873f9946d7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_143.txt @@ -0,0 +1 @@ +We use the same 8-bit optimizer from the earlier example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_144.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca829159eb852ac8dd486db27a90383a3bc49437 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_144.txt @@ -0,0 +1 @@ +Finally, we can add the main training loop. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_145.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..af269a8c767e81cf774d1c9b2182a85ffbdcc2a9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_145.txt @@ -0,0 +1 @@ +Note that the backward call is handled by 🤗 Accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_146.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd357f2f35af88ce1b2c089d553a6d881a780b40 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_146.txt @@ -0,0 +1,3 @@ +We can also see +how gradient accumulation works: we normalize the loss, so we get the average at the end of accumulation and once we have +enough steps we run the optimization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_147.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..632b4670f03a551515cd44aa93f6c375af24eb70 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_147.txt @@ -0,0 +1,2 @@ +Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the +benefit of more flexibility in the training loop. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_148.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..018e15144ddf3e4e24ddac8e1a6f91e18421e091 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_148.txt @@ -0,0 +1,2 @@ +For a full documentation of all features have a look at the +Accelerate documentation. 
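As a side note, more recent 🤗 Accelerate releases can also take care of the loss scaling for gradient accumulation themselves. The following is only a sketch of that variant, assuming a version of Accelerate where Accelerator accepts gradient_accumulation_steps and mixed_precision, and reusing the model, adam_bnb_optim and dataloader objects from the example above:
python
from accelerate import Accelerator

# Mirrors gradient_accumulation_steps=4 and fp16=True from the TrainingArguments above.
accelerator = Accelerator(gradient_accumulation_steps=4, mixed_precision="fp16")
model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader)

model.train()
for batch in dataloader:
    # Inside accumulate(), the loss scaling is handled for you and the optimizer
    # step/zero_grad only take effect on accumulation boundaries.
    with accelerator.accumulate(model):
        loss = model(**batch).loss
        accelerator.backward(loss)
        optimizer.step()
        optimizer.zero_grad()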
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_149.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa137559f6b396a5ee302e7950ed7d950132f79c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_149.txt @@ -0,0 +1,3 @@ +Efficient Software Prebuilds +PyTorch's pip and conda builds come prebuilt with the cuda toolkit +which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_150.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3c8abfb291bddf168cae86ab20211877d49c9a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_150.txt @@ -0,0 +1 @@ +At times, additional efforts may be required to pre-build some components. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_151.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbaae54ad515d7652d869e6df41b7c7f2f72d451 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_151.txt @@ -0,0 +1,2 @@ +For instance, if you're using libraries like apex that +don't come pre-compiled. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_152.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..129dafa84d824e42d991c877dc01e86010291dc4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_152.txt @@ -0,0 +1 @@ +In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_153.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..a168266e820aa4494abc6b2187511ee752130474 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_153.txt @@ -0,0 +1,2 @@ +To address these scenarios PyTorch and NVIDIA released a new version of NGC docker container which already comes with +everything prebuilt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_154.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..35c60f8e493bef4650cde888bd8e2d117543103c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_154.txt @@ -0,0 +1 @@ +You just need to install your programs on it, and it will run out of the box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_155.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a87c038f3e97f78fe616639fd73b9d76f6cd093 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_155.txt @@ -0,0 +1 @@ +This approach is also useful if you want to tweak the pytorch source and/or make a new customized build. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_156.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..05dcee7485912f6095d6c5edc5eb5de69313ee1e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_156.txt @@ -0,0 +1,2 @@ +To find the docker image version you want start with PyTorch release notes, +choose one of the latest monthly releases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_157.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8841fe29e8d97125cf37f47c6edf00c0155351e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_157.txt @@ -0,0 +1,2 @@ +Go into the release's notes for the desired release, check that the environment's +components are matching your needs (including NVIDIA Driver requirements!) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_158.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..43972ed72ed7ff75682dd6ac5c5822ec9beddf5e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_158.txt @@ -0,0 +1,2 @@ +and then at the very top of that document go +to the corresponding NGC page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_159.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..86cd41744b35908301ee40e842b8255747967747 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_159.txt @@ -0,0 +1 @@ +If for some reason you get lost, here is the index of all PyTorch NGC images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_160.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..00c25196874c1b6078fb51bd73b1d7b221e7abc5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_160.txt @@ -0,0 +1 @@ +Next follow the instructions to download and deploy the docker image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_161.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab87e2e0c00dc38c22d789ce458a185f1f0b6148 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_161.txt @@ -0,0 +1,3 @@ +Mixture of Experts +Some recent papers reported a 4-5x training speedup and a faster inference by integrating +Mixture of Experts (MoE) into the Transformer models. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_162.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..473b259e4c3cf598ca3c6d26900042b094061d79 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_162.txt @@ -0,0 +1,2 @@ +Since it has been discovered that more parameters lead to better performance, this technique makes it possible to increase the +number of parameters by an order of magnitude without increasing training costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_163.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..86b93f9a1f71b4d4618f6c9bfe7befb02f6d0d2d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_163.txt @@ -0,0 +1,2 @@ +In this approach, every other FFN layer is replaced with an MoE layer which consists of many experts, with a gated function +that trains each expert in a balanced way depending on the input token's position in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_164.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..85c4248bb4fd1109969bd973ddf4b25ebbb423a7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_164.txt @@ -0,0 +1,2 @@ +(source: GLAM) +You can find exhaustive details and comparison tables in the papers listed at the end of this section. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_165.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..e52fbd3a589c18f6cbd8c2b8a86e476cc67e0016 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_165.txt @@ -0,0 +1,2 @@ +The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude +larger than its dense equivalent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_166.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34a53b667bc61d6546c6aa86c0e755908f2c72a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_166.txt @@ -0,0 +1 @@ +Various distillation and other approaches have been proposed to overcome the much higher memory requirements. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_167.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cf863fd4c14e9a04522063a8bdb5750b307380c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_167.txt @@ -0,0 +1,3 @@ +There is a direct trade-off though: you can use just a few experts with a 2-3x smaller base model instead of dozens or +hundreds of experts, leading to a 5x smaller model, and thus increase the training speed moderately while increasing the +memory requirements moderately as well.
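To make the gating idea concrete, here is a deliberately naive sketch of an MoE feed-forward layer with a learned gate. It evaluates every expert densely for clarity (real MoE implementations route each token sparsely to its top-k experts), and all sizes and names are illustrative:
python
import torch
from torch import nn

class NaiveMoEFFN(nn.Module):
    """Toy mixture-of-experts FFN: a gate produces per-token expert weights."""

    def __init__(self, hidden_size=512, ffn_size=2048, num_experts=4):
        super().__init__()
        self.gate = nn.Linear(hidden_size, num_experts)
        self.experts = nn.ModuleList(
            nn.Sequential(nn.Linear(hidden_size, ffn_size), nn.GELU(), nn.Linear(ffn_size, hidden_size))
            for _ in range(num_experts)
        )

    def forward(self, x):  # x: (batch, seq_len, hidden_size)
        weights = torch.softmax(self.gate(x), dim=-1)                     # (batch, seq, experts)
        expert_outs = torch.stack([e(x) for e in self.experts], dim=-1)   # (batch, seq, hidden, experts)
        return torch.einsum("bse,bshe->bsh", weights, expert_outs)

layer = NaiveMoEFFN()
out = layer(torch.randn(2, 16, 512))  # same shape in, same shape out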
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_168.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..38c41db350d782bde7c20629ec30ed3cf39e2b7e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_168.txt @@ -0,0 +1,7 @@ +Most related papers and implementations are built around Tensorflow/TPUs: + +GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding +Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity +GLaM: Generalist Language Model (GLaM) + +And for Pytorch DeepSpeed has built one as well: DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale, Mixture of Experts - blog posts: 1, 2 and specific deployment with large transformer-based natural language generation models: blog post, Megatron-Deepspeed branch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_169.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ccf3e841c636e334a29778ff2e5a8829fe8ae4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_169.txt @@ -0,0 +1,3 @@ +Using PyTorch native attention and Flash Attention +PyTorch 2.0 released a native torch.nn.functional.scaled_dot_product_attention (SDPA), +that allows using fused GPU kernels such as memory-efficient attention and flash attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_170.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..6879cec16275075331ebbcf87ad914d6b13c767e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_170.txt @@ -0,0 +1,5 @@ +After installing the optimum package, the relevant internal modules can be +replaced to use PyTorch's native attention with: +python +model = model.to_bettertransformer() +Once converted, train the model as usual. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_171.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e92ffe610162b14e8b74f86fa637baef1c31bac --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_171.txt @@ -0,0 +1 @@ +The PyTorch-native scaled_dot_product_attention operator can only dispatch to Flash Attention if no attention_mask is provided. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_172.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..5225e961c28cd70c8361d1c40b6bf273a636a49d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_172.txt @@ -0,0 +1 @@ +By default, in training mode, the BetterTransformer integration drops the mask support and can only be used for training that does not require a padding mask for batched training. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_173.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..28d0e2bc324eaafdd47558544e54f7881ffd543a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_173.txt @@ -0,0 +1 @@ +This is the case, for example, during masked language modeling or causal language modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_174.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fe22cd3590e2c637a4894d6bc6a41f61e85c563 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_174.txt @@ -0,0 +1 @@ +BetterTransformer is not suited for fine-tuning models on tasks that require a padding mask. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_175.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..82440bbed8a91eecaede9cd7129bd53f7a396ba2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_175.txt @@ -0,0 +1 @@ +Check out this blogpost to learn more about acceleration and memory-savings with SDPA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_58.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..4856dc3f031bcc47c72e1f518c187d2a8c09c461 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_58.txt @@ -0,0 +1,2 @@ +BF16 +If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_59.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..c371b73a7de4fb45f21ceb6d0a1055d3697202c3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_59.txt @@ -0,0 +1,2 @@ +While +bf16 has a worse precision than fp16, it has a much bigger dynamic range. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_60.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..97073c879b0ad02ba0c69df15bdc74880ae4c4b4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_60.txt @@ -0,0 +1,2 @@ +In fp16 the biggest number you can have +is 65535 and any number above that will result in an overflow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_61.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..71a32f8450e009e42d34b515ea5d89ee95855343 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_61.txt @@ -0,0 +1 @@ +A bf16 number can be as large as 3.39e+38 (!) 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_62.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e89015e61dfe5e55f388ec96cb1f44f932231475 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_62.txt @@ -0,0 +1,2 @@ +which +is about the same as fp32 - because both have 8-bits used for the numerical range. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_63.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f252f1ee500b166ab11c519fdd7bb8e7301fbb4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_63.txt @@ -0,0 +1,5 @@ +You can enable BF16 in the 🤗 Trainer with: +python +training_args = TrainingArguments(bf16=True, **default_args) +TF32 +The Ampere hardware uses a magical data type called tf32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_64.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..58b5c49c4288eec145e72092fc12fef1682d25dd --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_64.txt @@ -0,0 +1,2 @@ +It has the same numerical range as fp32 (8-bits), but instead +of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_65.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d5db5233b9c7ac08898a4b4e2f671fff87bc13f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_65.txt @@ -0,0 +1,3 @@ +It's "magical" in the sense that +you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput +improvement. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_66.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..31f36788bb7fb09aae696da4776618f5d301c8db --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_66.txt @@ -0,0 +1,6 @@ +All you need to do is to add the following to your code: +python +import torch +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_67.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a28ff765451ac40df86030ede73f6b2e47bf106 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_67.txt @@ -0,0 +1,2 @@ +According to NVIDIA research, the +majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32. 
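A small sketch for checking whether your GPU can actually take advantage of these modes before enabling them; the functions used are standard PyTorch, and the compute-capability threshold of 8 corresponds to the Ampere generation:
python
import torch

# Ampere (compute capability 8.x) and newer expose both bf16 and TF32 tensor cores.
major, _ = torch.cuda.get_device_capability()
if major >= 8:
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

print("bf16 supported:", torch.cuda.is_bf16_supported())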
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_68.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..e41b32c42f45f5132d2b9be0b601ab24f5cf6f34 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_68.txt @@ -0,0 +1 @@ +If you're already using fp16 or bf16 mixed precision it may help with the throughput as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_69.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..b622ddaceafde63c155a998cf67a09fc6e897f43 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_69.txt @@ -0,0 +1,5 @@ +You can enable this mode in the 🤗 Trainer: +python +TrainingArguments(tf32=True, **default_args) + +tf32 can't be accessed directly via tensor.to(dtype=torch.tf32) because it is an internal CUDA data type. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_70.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0ceabe9bd055e5cf33ce692514100c742c565e2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_70.txt @@ -0,0 +1 @@ +You need torch>=1.7 to use tf32 data types. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_71.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..2acc4b257073c4d1b429ad547eaa7158ece6d24d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_71.txt @@ -0,0 +1,3 @@ +For additional information on tf32 vs other precisions, please refer to the following benchmarks: +RTX-3090 and +A100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_72.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..f324d9e7b2f4558392d16f7e8a2a0678320991cc --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_72.txt @@ -0,0 +1,2 @@ +Flash Attention 2 +You can speedup the training throughput by using Flash Attention 2 integration in transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_73.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..46e539336c5f8ddb503b8ea37ce1355e4aeffef0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_73.txt @@ -0,0 +1 @@ +Check out the appropriate section in the single GPU section to learn more about how to load a model with Flash Attention 2 modules. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_74.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fddb870ab02f6a093c56520343a2bfe2a2bd93e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_74.txt @@ -0,0 +1,2 @@ +Optimizer choice +The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_75.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..e728cf1baaec3a84ab4be1faa4dfd3d586095ebb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_75.txt @@ -0,0 +1,3 @@ +Adam achieves +good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory +footprint of the order of the number of model parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_76.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b33cae72829a11cedd9687e7445f57562b8473 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_76.txt @@ -0,0 +1 @@ +To remedy this, you can use an alternative optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_77.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..efe425b9f8b8f07037635821f1f6314eb97ae8e4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_77.txt @@ -0,0 +1,2 @@ +For example if you have NVIDIA/apex installed for NVIDIA GPUs, or ROCmSoftwarePlatform/apex for AMD GPUs, adamw_apex_fused will give you the +fastest training experience among all supported AdamW optimizers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_78.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b125e53ffa7fe60f2f37c8b18b6ea7b0361adb3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_78.txt @@ -0,0 +1,2 @@ +[Trainer] integrates a variety of optimizers that can be used out of box: adamw_hf, adamw_torch, adamw_torch_fused, +adamw_apex_fused, adamw_anyprecision, adafactor, or adamw_bnb_8bit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_79.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c46311b241db9e76f7ea9d867f51fdedf385b307 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_79.txt @@ -0,0 +1 @@ +More optimizers can be plugged in via a third-party implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_80.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..86350d7ef6eb4f4f3fe5b4058bd1edb2c49ac34e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_80.txt @@ -0,0 +1,3 @@ +Let's take a closer look at two alternatives to AdamW optimizer: +1. adafactor which is available in [Trainer] +2. adamw_bnb_8bit is also available in Trainer, but a third-party integration is provided below for demonstration. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_81.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..30cb6aa2b38f50c58f532fe89296309b194891e0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_81.txt @@ -0,0 +1,3 @@ +For comparison, for a 3B-parameter model, like "google-t5/t5-3b": +* A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8 * 3 => 24GB) +* Adafactor optimizer will need more than 12GB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_82.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c3c29add3ef1640660d3e0e194f9ce4de87f6f3 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_82.txt @@ -0,0 +1 @@ +It uses slightly more than 4 bytes for each parameter, so 4 * 3 and then some extra. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_83.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b18fc2ddfbf644d31fa54eec4541da1dbeb64d4 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_83.txt @@ -0,0 +1 @@ +* An 8-bit BNB quantized optimizer will use only 6GB (2 * 3) if all optimizer states are quantized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_84.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..af1b2d4b6c47cd8f0763e2e301363a0bd7e939be --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_84.txt @@ -0,0 +1,2 @@ +Adafactor +Adafactor doesn't store rolling averages for each element in weight matrices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_85.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..facb1299e4d3fcb1fbdff2c6baed7a5258cd655a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_85.txt @@ -0,0 +1,2 @@ +Instead, it keeps aggregated information +(sums of rolling averages row- and column-wise), significantly reducing its footprint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_86.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..53563ce7c6af7418bad7fb57b7c8a554265342f6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_86.txt @@ -0,0 +1,2 @@ +However, compared to Adam, +Adafactor may have slower convergence in certain cases.
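The back-of-the-envelope numbers above can be reproduced in a couple of lines; this is only the optimizer-state estimate and deliberately ignores weights, gradients, and activations:
python
# Rough optimizer-state sizes for a 3B-parameter model, mirroring the bullet points above.
params_in_billions = 3

adamw_gb     = 8 * params_in_billions  # 8 bytes/param (fp32 momentum + variance)
adafactor_gb = 4 * params_in_billions  # slightly more than 4 bytes/param in practice
adam8bit_gb  = 2 * params_in_billions  # 2 bytes/param when all states are quantized

print(adamw_gb, adafactor_gb, adam8bit_gb)  # 24 12 6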
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_87.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5eba43d42309ffba7518ec5d5c6914bb691e26 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_87.txt @@ -0,0 +1,5 @@ +You can switch to Adafactor by setting optim="adafactor" in [TrainingArguments]: +py +training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args) +Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training) +you can notice up to 3x improvement while maintaining the throughput! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_88.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..47d5772aae11b12ae1a2a581df83162a19350c3b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_88.txt @@ -0,0 +1,2 @@ +However, as mentioned before, the convergence of +Adafactor can be worse than Adam. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_89.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..93c84603807f1f2022190841c0ed05558a5045e5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_89.txt @@ -0,0 +1,2 @@ +8-bit Adam +Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_90.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..879fd57d48e53d73b7406a3d2300afe3916582ee --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_90.txt @@ -0,0 +1,2 @@ +Quantization +means that it stores the state with lower precision and dequantizes it only for the optimization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_91.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..53c6d35516bbd6c9057ade2bd5779fd084676856 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_91.txt @@ -0,0 +1,2 @@ +This is similar to the +idea behind mixed precision training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_92.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..638d3f93540f0b80bac65086fd90f2d58ef0ba3c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_92.txt @@ -0,0 +1,4 @@ +To use adamw_bnb_8bit, you simply need to set optim="adamw_bnb_8bit" in [TrainingArguments]: +py +training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", **default_args) +However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_93.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..5129d713906bf3dd7bc6d38937edcb3d970496a6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_93.txt @@ -0,0 +1,2 @@ +First, follow the installation guide in the GitHub repo to install the bitsandbytes library +that implements the 8-bit Adam optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_94.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..db963c4dc99322872e72c11db7904017917f5763 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_94.txt @@ -0,0 +1 @@ +Next you need to initialize the optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_95.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab5969d708114eb1ff866c8235fe737a171981df --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_95.txt @@ -0,0 +1,2 @@ +This involves two steps: +* First, group the model's parameters into two groups - one where weight decay should be applied, and the other one where it should not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_96.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..202d84130db72be1e28c3f3332dd1d18f3610358 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_96.txt @@ -0,0 +1 @@ +Usually, biases and layer norm parameters are not weight decayed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_97.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ebb8cbb84a989ac48b097241b7d12ff4e1800ae --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_97.txt @@ -0,0 +1 @@ +* Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_98.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..29ed74e7e3148d6e353659170fd3a7146d5e58b8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_98.txt @@ -0,0 +1,33 @@ +import bitsandbytes as bnb +from torch import nn +from transformers.trainer_pt_utils import get_parameter_names +training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +decay_parameters = get_parameter_names(model, [nn.LayerNorm]) +decay_parameters = [name for name in decay_parameters if "bias" not in name] +optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if n in decay_parameters], + "weight_decay": training_args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if n not in decay_parameters], + "weight_decay": 0.0, + }, +] +optimizer_kwargs = { + "betas": (training_args.adam_beta1, training_args.adam_beta2), + "eps": training_args.adam_epsilon, +} +optimizer_kwargs["lr"] = training_args.learning_rate +adam_bnb_optim = bnb.optim.Adam8bit( + optimizer_grouped_parameters, + betas=(training_args.adam_beta1, training_args.adam_beta2), + eps=training_args.adam_epsilon, + lr=training_args.learning_rate, +) + +Finally, pass the custom optimizer as an argument to the Trainer: +py +trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) +Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training), +you can expect to get about a 3x memory improvement and even slightly higher throughput as using Adafactor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_99.txt b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..c50e220ca7cf6488b585ff4b4ee31dc1daa4af21 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_gpu_one/chunk_99.txt @@ -0,0 +1,3 @@ +multi_tensor +pytorch-nightly introduced torch.optim._multi_tensor which should significantly speed up the optimizers for situations +with lots of small feature tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_10.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b519cb7f8b35b092a05bebdcb1a67412e199796 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_10.txt @@ -0,0 +1 @@ +For example, you can run the run_glue.py script with the MPS backend automatically enabled without making any changes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_11.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b8bceee900f42929b1a9bad9477cbd219f266ed --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_11.txt @@ -0,0 +1,15 @@ +export TASK_NAME=mrpc +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ +- --use_mps_device \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir + +Backends for distributed setups like gloo and nccl are not supported by the mps device which means you can only train on a single GPU with the MPS backend. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_12.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..82841c3e97057c015600efc08e241a81fa93700f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_12.txt @@ -0,0 +1 @@ +You can learn more about the MPS backend in the Introducing Accelerated PyTorch Training on Mac blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_5.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..edd15974b002e15545e7872c0c86f38dfbcf4f30 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_5.txt @@ -0,0 +1 @@ +To avoid this, you should set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1 to use the CPU kernels instead (you'll still see a UserWarning). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_6.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea21084d9f4af84694ac19e7b6849a279fb5c5ba --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_6.txt @@ -0,0 +1 @@ +If you run into any other errors, please open an issue in the PyTorch repository because the [Trainer] only integrates the MPS backend. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_7.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdb05fb14b9073e73d7f8247d45d3470e649eb2d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_7.txt @@ -0,0 +1,7 @@ +With the mps device set, you can: + +train larger networks or batch sizes locally +reduce data retrieval latency because the GPU's unified memory architecture allows direct access to the full memory store +reduce costs because you don't need to train on cloud-based GPUs or add additional local GPUs + +Get started by making sure you have PyTorch installed. 
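Once PyTorch is installed, a quick sanity check (using the standard torch.backends API) confirms that your build actually exposes the mps device; this is only a minimal sketch:
python
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    # Falls back to CPU, e.g. on Intel Macs or builds compiled without MPS support.
    device = torch.device("cpu")

print(device)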
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_8.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..067ba83c4c34b68cf569e5f224b2e3ac028dfd72 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_8.txt @@ -0,0 +1 @@ +MPS acceleration is supported on macOS 12.3+. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_special/chunk_9.txt b/chunked/content_aware_chunking/_perf_train_special/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd15f29624342413432ae5664ddc91c1fdbf76e9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_special/chunk_9.txt @@ -0,0 +1,2 @@ +pip install torch torchvision torchaudio +[TrainingArguments] uses the mps device by default if it's available which means you don't need to explicitly set the device. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_25.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c1fa35d9d443378e85c0df65a5d8f1a9cac4170 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_25.txt @@ -0,0 +1 @@ +All of the above warnings do not apply to TPU VMs! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_26.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..50f8766ca14a8879cca9bb8ac32ada0e6457b9b6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_26.txt @@ -0,0 +1 @@ +This is an opinionated document, so here's our opinion: Avoid using TPU Node if possible. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_27.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..96fd6671fbc664dec05a402ee22d7925a7cefde2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_27.txt @@ -0,0 +1 @@ +It is more confusing and more difficult to debug than TPU VMs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_28.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7da167a075995014f73eea9f06886bbda62c9eb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_28.txt @@ -0,0 +1 @@ +It is also likely to be unsupported in future - Google's latest TPU, TPUv4, can only be accessed as a TPU VM, which suggests that TPU Nodes are increasingly going to become a "legacy" access method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_29.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc7c9a426718dcfb3b8917c15d167d56f27f7861 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_29.txt @@ -0,0 +1 @@ +However, we understand that the only free TPU access is on Colab and Kaggle Kernels, which uses TPU Node - so we'll try to explain how to handle it if you have to!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_30.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c4bf9175fea373b81d1b13f77f52ce3b733acb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_30.txt @@ -0,0 +1 @@ +Check the TPU example notebook for code samples that explain this in more detail. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_31.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72c35f597f9a4e9c5d5f3f026cbf57b2fcc79db --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_31.txt @@ -0,0 +1 @@ +What sizes of TPU are available? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_32.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..80b77dd6231ea9716f09057b1754af5eed42e053 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_32.txt @@ -0,0 +1 @@ +A single TPU (a v2-8/v3-8/v4-8) runs 8 replicas. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_33.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..57263608bc9236e0768fd1ca422f9f813d959293 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_33.txt @@ -0,0 +1 @@ +TPUs exist in pods that can run hundreds or thousands of replicas simultaneously. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_34.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7f53b7e975354cf435c721acc93b011ca5ef815 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_34.txt @@ -0,0 +1 @@ +When you use more than a single TPU but less than a whole pod (for example, a v3-32), your TPU fleet is referred to as a pod slice. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_35.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f6cb30bd4a0ee2edf096ffd0a057e3fad405bb2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_35.txt @@ -0,0 +1 @@ +When you access a free TPU via Colab, you generally get a single v2-8 TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_36.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f4b5a1100cbe1b732266ec98bc4b918b60a9682 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_36.txt @@ -0,0 +1 @@ +I keep hearing about this XLA thing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_37.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..92b9e36433acb742a9a7ba7e42c6870c464981c2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_37.txt @@ -0,0 +1 @@ +What’s XLA, and how does it relate to TPUs? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_38.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b6d3d2ea979f3f0e0db72170f38d4f0b79d419 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_38.txt @@ -0,0 +1 @@ +XLA is an optimizing compiler, used by both TensorFlow and JAX. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_39.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..df956394f1bca49ea5f75394aaf951c588ae05a8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_39.txt @@ -0,0 +1 @@ +In JAX it is the only compiler, whereas in TensorFlow it is optional (but mandatory on TPU!). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_40.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c236d91e3ec7b8672fb84b184f3428de5195ae5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_40.txt @@ -0,0 +1 @@ +The easiest way to enable it when training a Keras model is to pass the argument jit_compile=True to model.compile(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_41.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f01745ea6a78fd1d36373df1b9e07f9c89767d --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_41.txt @@ -0,0 +1 @@ +If you don’t get any errors and performance is good, that’s a great sign that you’re ready to move to TPU! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_42.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..916a9fc5cde060907e050ab17ea62ed28916e36e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_42.txt @@ -0,0 +1 @@ +Debugging on TPU is generally a bit harder than on CPU/GPU, so we recommend getting your code running on CPU/GPU with XLA first before trying it on TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_43.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8012d54f32b5e1d5da344183972aa79e728e904 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_43.txt @@ -0,0 +1 @@ +You don’t have to train for long, of course - just for a few steps to make sure that your model and data pipeline are working like you expect them to. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_44.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..49464c3469b8c3fff32afe710bec921d15205a95 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_44.txt @@ -0,0 +1 @@ +XLA compiled code is usually faster - so even if you’re not planning to run on TPU, adding jit_compile=True can improve your performance. 
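Since the surrounding chunks describe passing jit_compile=True to model.compile(), here is a minimal sketch of what that looks like on an ordinary Keras model (the toy model and the optimizer/loss choices are placeholders, not taken from the original document):

python
import tensorflow as tf

# Any Keras model works the same way, including the TF model classes in transformers.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation="relu"),
    tf.keras.layers.Dense(2),
])

# jit_compile=True asks Keras to compile the train/eval/predict steps with XLA,
# which is also a convenient way to check XLA compatibility on CPU/GPU.
model.compile(
    optimizer="adam",
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    jit_compile=True,
)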
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_45.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..643d37b0f5298763d5eb27e1323147ab37eebd36 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_45.txt @@ -0,0 +1 @@ +Be sure to note the caveats below about XLA compatibility, though! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_46.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5bfc36827bb689455601cf0ac58e637c3d11312 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_46.txt @@ -0,0 +1 @@ +Tip born of painful experience: Although using jit_compile=True is a good way to get a speed boost and test if your CPU/GPU code is XLA-compatible, it can actually cause a lot of problems if you leave it in when actually training on TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_47.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..251824f6f8cddc1a55fd8c7cbb094a2a3c8bd97a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_47.txt @@ -0,0 +1 @@ +XLA compilation will happen implicitly on TPU, so remember to remove that line before actually running your code on a TPU! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_48.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..923675006180294f434e247c5e302f4353f2eafe --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_48.txt @@ -0,0 +1 @@ +How do I make my model XLA compatible? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_49.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b60b1bf5d2eac1867912f0fcb01e584938361c6 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_49.txt @@ -0,0 +1 @@ +In many cases, your code is probably XLA-compatible already! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_50.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bdb68b654123597a4784e14650479c90da468ae --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_50.txt @@ -0,0 +1 @@ +However, there are a few things that work in normal TensorFlow that don’t work in XLA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_51.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb454b89de8d54320e22c897df77ea689e94bbfe --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_51.txt @@ -0,0 +1,3 @@ +We’ve distilled them into three core rules below: + +🤗Specific HuggingFace Tip🤗: We’ve put a lot of effort into rewriting our TensorFlow models and loss functions to be XLA-compatible. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_52.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..28bdd39e3e4a1e36965a9a29af0b4e4921f13f13 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_52.txt @@ -0,0 +1 @@ +Our models and loss functions generally obey rule #1 and #2 by default, so you can skip over them if you’re using transformers models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_53.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..e065f16b4d31103302348996a3540f7e4af4784c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_53.txt @@ -0,0 +1 @@ +Don’t forget about these rules when writing your own models and loss functions, though! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_54.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..a957bbb710f28a409529dda5933e46885e326f1a --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_54.txt @@ -0,0 +1,2 @@ +XLA Rule #1: Your code cannot have “data-dependent conditionals” +What that means is that any if statement cannot depend on values inside a tf.Tensor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_55.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c218be9c43e6921f3d4c9dc62a662de1ce308779 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_55.txt @@ -0,0 +1 @@ +For example, this code block cannot be compiled with XLA! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_56.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..39391b0d16ab48ac8210b13aac9e7029839f8871 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_56.txt @@ -0,0 +1,4 @@ +python +if tf.reduce_sum(tensor) > 10: + tensor = tensor / 2.0 +This might seem very restrictive at first, but most neural net code doesn’t need to do this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_57.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..d49bf107f7102979e709a8110e986edffa53b0f2 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_57.txt @@ -0,0 +1,5 @@ +You can often get around this restriction by using tf.cond (see the documentation here) or by removing the conditional and finding a clever math trick with indicator variables instead, like so: +python +sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32) +tensor = tensor / (1.0 + sum_over_10) +This code has exactly the same effect as the code above, but by avoiding a conditional, we ensure it will compile with XLA without problems!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_58.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..8223c449d34f88b8a300ea8cceeac497d3c5e7a8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_58.txt @@ -0,0 +1,2 @@ +XLA Rule #2: Your code cannot have “data-dependent shapes” +What this means is that the shape of all of the tf.Tensor objects in your code cannot depend on their values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_59.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a8578248c072f5558c5baff259df09a76f4294 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_59.txt @@ -0,0 +1 @@ +For example, the function tf.unique cannot be compiled with XLA, because it returns a tensor containing one instance of each unique value in the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_60.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4baf1c9647169d0faba090c90f6d3ec49bd9b85 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_60.txt @@ -0,0 +1 @@ +The shape of this output will obviously be different depending on how repetitive the input Tensor was, and so XLA refuses to handle it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_61.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..715314e5fa0ebe5da968df94055cb2cd1c4a749b --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_61.txt @@ -0,0 +1 @@ +In general, most neural network code obeys rule #2 by default. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_62.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee0d1cb0369662774fa59cf659e693596481f302 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_62.txt @@ -0,0 +1 @@ +However, there are a few common cases where it becomes a problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_63.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..a512589d18a7ce649941ea7005dbbad699514c7c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_63.txt @@ -0,0 +1 @@ +One very common one is when you use label masking, setting your labels to a negative value to indicate that those positions should be ignored when computing the loss.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_64.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fee1a90bcf1bf2f6645d2faa32d3e80a37f6cf5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_64.txt @@ -0,0 +1,8 @@ +If you look at NumPy or PyTorch loss functions that support label masking, you will often see code like this that uses boolean indexing: +python +label_mask = labels >= 0 +masked_outputs = outputs[label_mask] +masked_labels = labels[label_mask] +loss = compute_loss(masked_outputs, masked_labels) +mean_loss = torch.mean(loss) +This code is totally fine in NumPy or PyTorch, but it breaks in XLA! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_65.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9e07966c04eee20748a971960433112b34835f9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_65.txt @@ -0,0 +1 @@ +Why? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_66.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..cce6d0bd5bb9a816422310ebd1c556b5138a04de --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_66.txt @@ -0,0 +1 @@ +Because the shape of masked_outputs and masked_labels depends on how many positions are masked - that makes it a data-dependent shape. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_67.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..25fe4a15cda62cc36ca2b54cb15889b1235ebbc8 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_67.txt @@ -0,0 +1 @@ +However, just like for rule #1, we can often rewrite this code to yield exactly the same output without any data-dependent shapes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_68.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf79dfc3a01e3b69e91500b381622e3a96f6d708 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_68.txt @@ -0,0 +1,6 @@ +python +label_mask = tf.cast(labels >= 0, tf.float32) +loss = compute_loss(outputs, labels) +loss = loss * label_mask # Set negative label positions to 0 +mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask) +Here, we avoid data-dependent shapes by computing the loss for every position, but zeroing out the masked positions in both the numerator and denominator when we calculate the mean, which yields exactly the same result as the first block while maintaining XLA compatibility. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_69.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..5827468182e4cab97f0f9654ca14005fffe1bd55 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_69.txt @@ -0,0 +1 @@ +Note that we use the same trick as in rule #1 - converting a tf.bool to tf.float32 and using it as an indicator variable. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_70.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..3390f96c85d6dfc2b3d32b30d79dbf474d199131 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_70.txt @@ -0,0 +1 @@ +This is a really useful trick, so remember it if you need to convert your own code to XLA! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_71.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..660c02d1b1d781538657a8e7eedd7f34e8229da5 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_71.txt @@ -0,0 +1,2 @@ +XLA Rule #3: XLA will need to recompile your model for every different input shape it sees +This is the big one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_72.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffc58c646f88e0d6253f4d4d68b2c2db919f9317 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_72.txt @@ -0,0 +1 @@ +What this means is that if your input shapes are very variable, XLA will have to recompile your model over and over, which will create huge performance problems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_73.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..475c2a8c18ba4e5740903be901d69b2d1395d0c7 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_73.txt @@ -0,0 +1 @@ +This commonly arises in NLP models, where input texts have variable lengths after tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_74.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..4213a43772e5344eb7f3f93b7382aae45f424330 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_74.txt @@ -0,0 +1 @@ +In other modalities, static shapes are more common and this rule is much less of a problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_75.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..857b77fd83ef62aff5dbfd1eade6dabe2e983453 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_75.txt @@ -0,0 +1 @@ +How can you get around rule #3? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_76.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..81cc59e37d132fe683609e58f3f06d0d43656153 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_76.txt @@ -0,0 +1 @@ +The key is padding - if you pad all your inputs to the same length, and then use an attention_mask, you can get the same results as you’d get from variable shapes, but without any XLA issues. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_77.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a44260438725c1653d7de7bf1b226fb742048ed --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_77.txt @@ -0,0 +1 @@ +However, excessive padding can cause severe slowdown too - if you pad all your samples to the maximum length in the whole dataset, you might end up with batches consisting endless padding tokens, which will waste a lot of compute and memory! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_78.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..93758fa9b0c7fb3682fa80cfdfc35e48c0c4ec6f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_78.txt @@ -0,0 +1 @@ +There isn’t a perfect solution to this problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_79.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6603684f97e72e889de364d15d20358e1e7f726 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_79.txt @@ -0,0 +1 @@ +However, you can try some tricks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_80.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..8265e5bf4bda393307ab360c69b75f0acebe807e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_80.txt @@ -0,0 +1 @@ +One very useful trick is to pad batches of samples up to a multiple of a number like 32 or 64 tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_81.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef08474091b81e870c48960ed3fa46d37d39058e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_81.txt @@ -0,0 +1 @@ +This often only increases the number of tokens by a small amount, but it hugely reduces the number of unique input shapes, because every input shape now has to be a multiple of 32 or 64. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_82.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..28761c901580f1aa12a3a5c40eec660e3685ed2e --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_82.txt @@ -0,0 +1 @@ +Fewer unique input shapes means fewer XLA compilations! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_83.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..67496acfc2380ebd335b57255feb01ee22a08d43 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_83.txt @@ -0,0 +1 @@ +🤗Specific HuggingFace Tip🤗: Our tokenizers and data collators have methods that can help you here. 
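The chunks that follow name the relevant tokenizer arguments; as a hedged sketch of how they fit together (the checkpoint and the example sentences are purely illustrative):

python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")

sentences = [
    "A short example.",
    "A somewhat longer example sentence that produces more tokens after tokenization.",
]

# Pad the batch to the longest sample, rounded up to a multiple of 64 tokens, so the
# number of distinct input shapes (and therefore XLA recompilations) stays small.
batch = tokenizer(
    sentences,
    padding="longest",
    pad_to_multiple_of=64,
    return_tensors="tf",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)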
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_84.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2be89eedfcbd35d6c7f5942c6004d0053675ddb --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_84.txt @@ -0,0 +1 @@ +You can use padding="max_length" or padding="longest" when calling tokenizers to get them to output padded data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_85.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..29a4c16fda33d9b2d9015a251ea64316807c876c --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_85.txt @@ -0,0 +1 @@ +Our tokenizers and data collators also have a pad_to_multiple_of argument that you can use to reduce the number of unique input shapes you see! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_86.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..8151be8a1c68718d0aff975e92466305fb4edeee --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_86.txt @@ -0,0 +1 @@ +How do I actually train my model on TPU? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_87.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..f97c9a2b8ddd890bbe6a660834a8f041fa526f9f --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_87.txt @@ -0,0 +1 @@ +Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been prepared appropriately, running on TPU is surprisingly easy! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_88.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e4d24cfe9d833ae275808e1f6b4835fdd7b8560 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_88.txt @@ -0,0 +1 @@ +All you really need to change in your code is to add a few lines to initialize your TPU, and to ensure that your model and dataset are created inside a TPUStrategy scope. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_89.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..b69b13eeaf351ab7cf1b40193204c9c27e4c05d9 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_89.txt @@ -0,0 +1 @@ +Take a look at our TPU example notebook to see this in action! 
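For readers without the notebook at hand, a minimal sketch of the initialization and TPUStrategy scope described above looks roughly like this (standard TensorFlow APIs; the toy Keras model stands in for your real model and dataset):

python
import tensorflow as tf

# Find and initialize the TPU. On Colab (TPU Node) the resolver discovers the remote TPU;
# on a TPU VM, TPUClusterResolver(tpu="local") is the usual choice.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)

strategy = tf.distribute.TPUStrategy(resolver)

# Create (and compile) your model - and build your dataset - inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # placeholder model
    model.compile(optimizer="adam", loss="mse")  # no jit_compile=True needed on TPU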
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_90.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..97556850e1a9179d369bfa9296d5ac232d9499f0 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_90.txt @@ -0,0 +1,10 @@ +Summary +There was a lot in here, so let’s summarize with a quick checklist you can follow when you want to get your model ready for TPU training: + +Make sure your code follows the three rules of XLA +Compile your model with jit_compile=True on CPU/GPU and confirm that you can train it with XLA +Either load your dataset into memory or use a TPU-compatible dataset loading approach (see notebook) +Migrate your code either to Colab (with accelerator set to “TPU”) or a TPU VM on Google Cloud +Add TPU initializer code (see notebook) +Create your TPUStrategy and make sure dataset loading and model creation are inside the strategy.scope() (see notebook) +Don’t forget to take jit_compile=True out again when you move to TPU! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_91.txt b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..32beca617ddf96ed0e1522966dd012a991f33145 --- /dev/null +++ b/chunked/content_aware_chunking/_perf_train_tpu_tf/chunk_91.txt @@ -0,0 +1,3 @@ +🙏🙏🙏🥺🥺🥺 +Call model.fit() +You did it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_10.txt b/chunked/content_aware_chunking/_performance/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..99f6029256affd3555c96370bdca0513fc42bbf8 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_10.txt @@ -0,0 +1 @@ +However, there are also techniques that are specific to multi-GPU or CPU training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_11.txt b/chunked/content_aware_chunking/_performance/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef8a770d4a8f7830fd157d5211cde58e2d98ab8a --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_11.txt @@ -0,0 +1,2 @@ +We cover them in +separate sections. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_12.txt b/chunked/content_aware_chunking/_performance/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b2b47c8e6dc26f4d3e3f12805e31738543202ed --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_12.txt @@ -0,0 +1 @@ +Methods and tools for efficient training on a single GPU: start here to learn common approaches that can help optimize GPU memory utilization, speed up the training, or both. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_13.txt b/chunked/content_aware_chunking/_performance/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c723b70d56dfb16272374f7b094b90597025320b --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_13.txt @@ -0,0 +1 @@ +Multi-GPU training section: explore this section to learn about further optimization methods that apply to multi-GPU settings, such as data, tensor, and pipeline parallelism.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_14.txt b/chunked/content_aware_chunking/_performance/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..96651029667eefd27f8329014b4136e9718e636f --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_14.txt @@ -0,0 +1 @@ +CPU training section: learn about mixed precision training on CPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_15.txt b/chunked/content_aware_chunking/_performance/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0179fe3fde751268fd3d4b7c454a3e75506fccb4 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_15.txt @@ -0,0 +1 @@ +Efficient Training on Multiple CPUs: learn about distributed CPU training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_16.txt b/chunked/content_aware_chunking/_performance/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0ea5bb8290811708aa49b7c8a35a7bc745389b --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_16.txt @@ -0,0 +1 @@ +Training on TPU with TensorFlow: if you are new to TPUs, refer to this section for an opinionated introduction to training on TPUs and using XLA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_17.txt b/chunked/content_aware_chunking/_performance/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..78509d0d00f6622da2a3f4307b26264585714013 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_17.txt @@ -0,0 +1 @@ +Custom hardware for training: find tips and tricks when building your own deep learning rig. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_18.txt b/chunked/content_aware_chunking/_performance/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fef3cd89bc685b44742edf3a53d6341ea0897de1 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_18.txt @@ -0,0 +1,4 @@ +Hyperparameter Search using Trainer API + +Inference +Efficient inference with large models in a production environment can be as challenging as training them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_19.txt b/chunked/content_aware_chunking/_performance/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..43a430aba2f2cfedacea94a9771edbd74528931c --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_19.txt @@ -0,0 +1,2 @@ +In the following +sections we go through the steps to run inference on CPU and single/multi-GPU setups. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_20.txt b/chunked/content_aware_chunking/_performance/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e0c0639c4700d5e0445c8048707eb4a29787e43 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_20.txt @@ -0,0 +1,7 @@ +Inference on a single CPU +Inference on a single GPU +Multi-GPU inference +XLA Integration for TensorFlow Models + +Training and inference +Here you'll find techniques, tips and tricks that apply whether you are training a model, or running inference with it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_21.txt b/chunked/content_aware_chunking/_performance/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b408144e50aaf6f3d8e8f3f6a31388753f45b929 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_21.txt @@ -0,0 +1,6 @@ +Instantiating a big model +Troubleshooting performance issues + +Contribute +This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to +make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_22.txt b/chunked/content_aware_chunking/_performance/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..100332b71102d1fd75d4177681a4b5fd2ef83fec --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_22.txt @@ -0,0 +1,2 @@ +When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the +source of that information (unless it comes directly from you). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_6.txt b/chunked/content_aware_chunking/_performance/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..97dfaa0d3e3a1190da7ae3ac59e788b27b8fa4e4 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_6.txt @@ -0,0 +1 @@ +Use this document as your starting point to navigate further to the methods that match your scenario. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_7.txt b/chunked/content_aware_chunking/_performance/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9ede54f43d2f0da14e08fd0aae41331c7333588 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_7.txt @@ -0,0 +1,2 @@ +Training +Training large transformer models efficiently requires an accelerator such as a GPU or TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_8.txt b/chunked/content_aware_chunking/_performance/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b63af78a4a81677b3acdde9e5b5d7c2abb323300 --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_8.txt @@ -0,0 +1,2 @@ +The most common case is where +you have a single GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_performance/chunk_9.txt b/chunked/content_aware_chunking/_performance/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6476b417de3a9bf6e84cc96c2757d488d8cc876e --- /dev/null +++ b/chunked/content_aware_chunking/_performance/chunk_9.txt @@ -0,0 +1,2 @@ +The methods that you can apply to improve training efficiency on a single GPU extend to other setups +such as multiple GPU. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_14.txt b/chunked/content_aware_chunking/_perplexity/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d5d8d3c05244ed01fe22f8bcb8d0dc532512f07 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_14.txt @@ -0,0 +1,3 @@ +This is quick to compute since the perplexity of each segment can be computed in one forward pass, but serves as a poor +approximation of the fully-factorized perplexity and will typically yield a higher (worse) PPL because the model will +have less context at most of the prediction steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_15.txt b/chunked/content_aware_chunking/_perplexity/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..986c59806a24b5e18e1020ab4d168b44821e8136 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_15.txt @@ -0,0 +1 @@ +Instead, the PPL of fixed-length models should be evaluated with a sliding-window strategy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_16.txt b/chunked/content_aware_chunking/_perplexity/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c685b06f94c00124ac4bc0104d590ace36214c22 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_16.txt @@ -0,0 +1,2 @@ +This involves repeatedly +sliding the context window so that the model has more context when making each prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_17.txt b/chunked/content_aware_chunking/_perplexity/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..08cfc75da75637d4be56ebee266a169134ef944a --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_17.txt @@ -0,0 +1,2 @@ +This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more +favorable score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_18.txt b/chunked/content_aware_chunking/_perplexity/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b25cffcd8a711e6c7c11f52bba0b89f4bf58a2e6 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_18.txt @@ -0,0 +1 @@ +The downside is that it requires a separate forward pass for each token in the corpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_19.txt b/chunked/content_aware_chunking/_perplexity/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0606644d2bb7cc0c02c7771af66bcabefe23b2f --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_19.txt @@ -0,0 +1,3 @@ +A good +practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by +1 token a time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_20.txt b/chunked/content_aware_chunking/_perplexity/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b70252fcc2840e51bb57648d51589af3a9a5985 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_20.txt @@ -0,0 +1,2 @@ +This allows computation to proceed much faster while still giving the model a large context to make +predictions at each step. 
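For reference, the quantity that all of these strategies approximate is the standard autoregressive perplexity of a tokenized sequence $X = (x_1, \dots, x_t)$ under a model $p_\theta$ (the notation here is assumed for illustration, not taken from the chunks above):

$$\mathrm{PPL}(X) = \exp\left(-\frac{1}{t}\sum_{i=1}^{t}\log p_\theta(x_i \mid x_{<i})\right)$$

The fixed-length and sliding-window strategies differ only in how much of the context $x_{<i}$ the model actually sees when scoring each $x_i$, which is why smaller strides typically yield a lower (better) PPL.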
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_21.txt b/chunked/content_aware_chunking/_perplexity/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..18692dc77b95276f035f470eef1dfa7b88e11010 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_21.txt @@ -0,0 +1,2 @@ +Example: Calculating perplexity with GPT-2 in 🤗 Transformers +Let's demonstrate this process with GPT-2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_22.txt b/chunked/content_aware_chunking/_perplexity/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a629b48db1768ee9c607522e92951495ee83ea06 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_22.txt @@ -0,0 +1,8 @@ +thon +from transformers import GPT2LMHeadModel, GPT2TokenizerFast +device = "cuda" +model_id = "openai-community/gpt2-large" +model = GPT2LMHeadModel.from_pretrained(model_id).to(device) +tokenizer = GPT2TokenizerFast.from_pretrained(model_id) + +We'll load in the WikiText-2 dataset and evaluate the perplexity using a few different sliding-window strategies. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_23.txt b/chunked/content_aware_chunking/_perplexity/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..7112b1720825b5533f665380b6216e95d720fc50 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_23.txt @@ -0,0 +1,3 @@ +Since +this dataset is small and we're just doing one forward pass over the set, we can just load and encode the entire +dataset in memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_24.txt b/chunked/content_aware_chunking/_perplexity/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..f16258d10fe074573968a12b81a56e522c4ee398 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_24.txt @@ -0,0 +1,7 @@ +thon +from datasets import load_dataset +test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test") +encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt") + +With 🤗 Transformers, we can simply pass the input_ids as the labels to our model, and the average negative +log-likelihood for each token is returned as the loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_25.txt b/chunked/content_aware_chunking/_perplexity/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc5da580031911463bf66c85aaecbc5ed38ceb37 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_25.txt @@ -0,0 +1,2 @@ +With our sliding window approach, however, there is overlap in +the tokens we pass to the model at each iteration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_26.txt b/chunked/content_aware_chunking/_perplexity/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec92b66edecf59092e2d42a560c7da0a797a9af7 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_26.txt @@ -0,0 +1,2 @@ +We don't want the log-likelihood for the tokens we're just treating +as context to be included in our loss, so we can set these targets to -100 so that they are ignored. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_27.txt b/chunked/content_aware_chunking/_perplexity/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a266a9b6577fd7474de33215d02fbacce660e7f4 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_27.txt @@ -0,0 +1,2 @@ +The following +is an example of how we could do this with a stride of 512. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_28.txt b/chunked/content_aware_chunking/_perplexity/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..560cf71051ed952f96ae9f102f51459a61a3c8b2 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_28.txt @@ -0,0 +1,3 @@ +This means that the model will have at least 512 tokens +for context when calculating the conditional likelihood of any one token (provided there are 512 preceding tokens +available to condition on). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_29.txt b/chunked/content_aware_chunking/_perplexity/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..6978a4586fdd1515207a89fe8b71a682d8089502 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_29.txt @@ -0,0 +1,19 @@ +thon +import torch +from tqdm import tqdm +max_length = model.config.n_positions +stride = 512 +seq_len = encodings.input_ids.size(1) +nlls = [] +prev_end_loc = 0 +for begin_loc in tqdm(range(0, seq_len, stride)): + end_loc = min(begin_loc + max_length, seq_len) + trg_len = end_loc - prev_end_loc # may be different from stride on last loop + input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device) + target_ids = input_ids.clone() + target_ids[:, :-trg_len] = -100 +with torch.no_grad(): + outputs = model(input_ids, labels=target_ids) + + # loss is calculated using CrossEntropyLoss which averages over valid labels + # N.B. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_30.txt b/chunked/content_aware_chunking/_perplexity/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..f53d04742bedfde04d3a6ca5a354daa6cd94b744 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_30.txt @@ -0,0 +1,14 @@ +the model only calculates loss over trg_len - 1 labels, because it internally shifts the labels + # to the left by 1. + neg_log_likelihood = outputs.loss + +nlls.append(neg_log_likelihood) + +prev_end_loc = end_loc +if end_loc == seq_len: + break + +ppl = torch.exp(torch.stack(nlls).mean()) + +Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window +strategy we discussed above. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_31.txt b/chunked/content_aware_chunking/_perplexity/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..39899fc655781d3b021783603a1856b2f9ca277c --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_31.txt @@ -0,0 +1,2 @@ +The smaller the stride, the more context the model will have in making each prediction, +and the better the reported perplexity will typically be. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_32.txt b/chunked/content_aware_chunking/_perplexity/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a4bd5a9bd2d564fc4b456073c0d6b2dfde8de93 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_32.txt @@ -0,0 +1 @@ +When we run the above with stride = 1024, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_33.txt b/chunked/content_aware_chunking/_perplexity/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ea7e257e54cf8e6ffa8e45ef1a64c9cf8f4a39 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_33.txt @@ -0,0 +1,2 @@ +no overlap, the resulting PPL is 19.44, which is about the same +as the 19.93 reported in the GPT-2 paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_34.txt b/chunked/content_aware_chunking/_perplexity/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..a389d4dac7209991e21fed7e15a2e43c46520492 --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_34.txt @@ -0,0 +1,2 @@ +By using stride = 512 and thereby employing our striding window +strategy, this jumps down to 16.45. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_perplexity/chunk_35.txt b/chunked/content_aware_chunking/_perplexity/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..49d23a01d8845761be7323c7c6b378029e751eba --- /dev/null +++ b/chunked/content_aware_chunking/_perplexity/chunk_35.txt @@ -0,0 +1,2 @@ +This is not only a more favorable score, but is calculated in a way that is +closer to the true autoregressive decomposition of a sequence likelihood. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_10.txt b/chunked/content_aware_chunking/_philosophy/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0e98f218b04fd0283d3e4101c292b3b05b69368 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_10.txt @@ -0,0 +1,2 @@ +The code is usually as close to the original code base as possible which means some PyTorch code may be not as + pytorchic as it could be as a result of being converted TensorFlow code and vice versa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_11.txt b/chunked/content_aware_chunking/_philosophy/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b49d9d004021b2871e7e1174c1d27e391c48eebf --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_11.txt @@ -0,0 +1,5 @@ +A few other goals: + +Expose the models' internals as consistently as possible: + +We give access, using a single API, to the full hidden-states and attention weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_12.txt b/chunked/content_aware_chunking/_philosophy/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b7108b33ce87587df7aef28f9cbe5c7ebf9f78c --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_12.txt @@ -0,0 +1 @@ +The preprocessing classes and base model APIs are standardized to easily switch between models. 
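As a small illustration of the single API to the full hidden-states and attention weights mentioned in the philosophy chunks above (the checkpoint and input text are placeholders chosen for this sketch):

python
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
model = AutoModel.from_pretrained("google-bert/bert-base-cased")

inputs = tokenizer("Hello world", return_tensors="pt")
# The same two flags work across architectures.
outputs = model(**inputs, output_hidden_states=True, output_attentions=True)
print(len(outputs.hidden_states), len(outputs.attentions))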
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_13.txt b/chunked/content_aware_chunking/_philosophy/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f403d1110a333a31283e4e219b047a7a855e802e --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_13.txt @@ -0,0 +1,3 @@ +Incorporate a subjective selection of promising tools for fine-tuning and investigating these models: + +A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_14.txt b/chunked/content_aware_chunking/_philosophy/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b83c3e4c2dcd3aaf7e03c76beee6e53d26ce3506 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_14.txt @@ -0,0 +1 @@ +Simple ways to mask and prune Transformer heads. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_15.txt b/chunked/content_aware_chunking/_philosophy/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4bec679fb51ee2284852e27e1d21c58cb6b87ff --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_15.txt @@ -0,0 +1 @@ +Easily switch between PyTorch, TensorFlow 2.0 and Flax, allowing training with one framework and inference with another. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_16.txt b/chunked/content_aware_chunking/_philosophy/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..312fa06ecaf31bccf28e9288710f363f672fac7f --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_16.txt @@ -0,0 +1,4 @@ +Main concepts +The library is built around three types of classes for each model: + +Model classes can be PyTorch models (torch.nn.Module), Keras models (tf.keras.Model) or JAX/Flax models (flax.linen.Module) that work with the pretrained weights provided in the library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_17.txt b/chunked/content_aware_chunking/_philosophy/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..87c04579305c84f226fbd80857d5b1314b42d999 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_17.txt @@ -0,0 +1 @@ +Configuration classes store the hyperparameters required to build a model (such as the number of layers and hidden size). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_18.txt b/chunked/content_aware_chunking/_philosophy/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..156a7322da7eecdd1d4366f186d614ea79af6c08 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_18.txt @@ -0,0 +1 @@ +You don't always need to instantiate these yourself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_19.txt b/chunked/content_aware_chunking/_philosophy/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3192154a0b4de0b48da9a2720f4a7a86b3c67d95 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_19.txt @@ -0,0 +1 @@ +In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_20.txt b/chunked/content_aware_chunking/_philosophy/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2ca4516b0dc10cc8ff2b073dde4430213bb89b3 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_20.txt @@ -0,0 +1 @@ +Preprocessing classes convert the raw data into a format accepted by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_21.txt b/chunked/content_aware_chunking/_philosophy/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e968ee3cac77a7005f27ee435f6d0ed1ec1b877d --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_21.txt @@ -0,0 +1 @@ +A tokenizer stores the vocabulary for each model and provides methods for encoding and decoding strings into a list of token embedding indices to be fed to a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_22.txt b/chunked/content_aware_chunking/_philosophy/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c34a6a2a47f2ddd806ce9aa5d4c304ccc62f6b9 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_22.txt @@ -0,0 +1 @@ +Image processors preprocess vision inputs, feature extractors preprocess audio inputs, and a processor handles multimodal inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_23.txt b/chunked/content_aware_chunking/_philosophy/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3da5c822bdf290c117ea978c37a1c81ec5973a2 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_23.txt @@ -0,0 +1,5 @@ +All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods: + +from_pretrained() lets you instantiate a model, configuration, and preprocessing class from a pretrained version either + provided by the library itself (the supported models can be found on the Model Hub) or + stored locally (or on a server) by the user. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_24.txt b/chunked/content_aware_chunking/_philosophy/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d939c5e0b2e9e96502063e372bc500c5e79efcb --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_24.txt @@ -0,0 +1,2 @@ +save_pretrained() lets you save a model, configuration, and preprocessing class locally so that it can be reloaded using + from_pretrained(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_25.txt b/chunked/content_aware_chunking/_philosophy/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4c7cb99f990e649c14469fd8bcc8bc7c6f324d4 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_25.txt @@ -0,0 +1 @@ +push_to_hub() lets you share a model, configuration, and a preprocessing class to the Hub, so it is easily accessible to everyone.
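The three sharing methods listed in the philosophy chunks above map directly onto the library's API. A minimal sketch, assuming the distilbert-base-uncased checkpoint, the local directory ./my-model, and the Hub repo name my-username/my-model purely as illustrative placeholders:

from transformers import AutoConfig, AutoModel, AutoTokenizer

# from_pretrained() builds the configuration, model, and tokenizer from a
# checkpoint on the Hub or from a local path.
config = AutoConfig.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# save_pretrained() writes the same artifacts locally so they can be reloaded
# later with from_pretrained().
model.save_pretrained("./my-model")
tokenizer.save_pretrained("./my-model")
reloaded = AutoModel.from_pretrained("./my-model")

# push_to_hub() uploads them to the Hub (requires being logged in, for example
# with `huggingface-cli login`); the repo name here is only a placeholder.
# model.push_to_hub("my-username/my-model")
# tokenizer.push_to_hub("my-username/my-model")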
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_6.txt b/chunked/content_aware_chunking/_philosophy/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0811836c29864309c2764a7f4da394be7be4c770 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_6.txt @@ -0,0 +1 @@ +As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_7.txt b/chunked/content_aware_chunking/_philosophy/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..882826d57c971853f97092d38c017467b1f00272 --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_7.txt @@ -0,0 +1,3 @@ +If you want to + extend or build upon the library, just use regular Python, PyTorch, TensorFlow, Keras modules and inherit from the base + classes of the library to reuse functionalities like model loading and saving. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_8.txt b/chunked/content_aware_chunking/_philosophy/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e119acd92ef077edf4f470b7aca26087f4b031b --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_8.txt @@ -0,0 +1 @@ +If you'd like to learn more about our coding philosophy for models, check out our Repeat Yourself blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_philosophy/chunk_9.txt b/chunked/content_aware_chunking/_philosophy/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..daa1ff6ad940cb425e812d8f08284d6ee96905fd --- /dev/null +++ b/chunked/content_aware_chunking/_philosophy/chunk_9.txt @@ -0,0 +1,4 @@ +Provide state-of-the-art models with performances as close as possible to the original models: + +We provide at least one example for each architecture which reproduces a result provided by the official authors + of said architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_18.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..21b52fab6c4b2afde0649c0e373c07bc4d627444 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_18.txt @@ -0,0 +1 @@ +For a deep-dive comparison on Wav2Vec2 vs Whisper, refer to the Audio Transformers Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_19.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..5defb21c15ae8649e2c172c5d174b40654ea886a --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_19.txt @@ -0,0 +1 @@ +We really encourage you to check out the Hub for models in different languages, models specialized in your field, and more. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_20.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c498f1c02b5dcc379ddb991bb91233a9ea290f --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_20.txt @@ -0,0 +1,2 @@ +You can check out and compare model results directly from your browser on the Hub to see if it fits or +handles corner cases better than other ones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_21.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c14da7250d98259a88032637c805d751ec72f488 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_21.txt @@ -0,0 +1 @@ +And if you don't find a model for your use case, you can always start training your own! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_22.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..96b7a61eacf7463c7e6d585731de0689cc23e363 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_22.txt @@ -0,0 +1,9 @@ +If you have several inputs, you can pass your input as a list: +py +transcriber( + [ + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac", + "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac", + ] +) +Pipelines are great for experimentation as switching from one model to another is trivial; however, there are some ways to optimize them for larger workloads than experimentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_23.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..80021132f5adf70ac43bf8686151ad6bc03eb183 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_23.txt @@ -0,0 +1,6 @@ +See the following guides that dive into iterating over whole datasets or using pipelines in a webserver: +of the docs: +* Using pipelines on a dataset +* Using pipelines for a webserver +Parameters +[pipeline] supports many parameters; some are task specific, and some are general to all pipelines. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_24.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..061036173a5d0368f8492d383f0bc5eb68b1b30b --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_24.txt @@ -0,0 +1,4 @@ +In general, you can specify parameters anywhere you want: + +transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1) +out = transcriber() # This will use my_parameter=1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_25.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c0d8273714a0003ad6d2fa184b9eb916061f3b8 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_25.txt @@ -0,0 +1 @@ +out = transcriber(, my_parameter=2) # This will override and use my_parameter=2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_26.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..8908dfa2a1e873407b6035e8182be2751bc9d3ea --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_26.txt @@ -0,0 +1 @@ +out = transcriber() # This will go back to using my_parameter=1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_27.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a88be4d64db40f664c2ecae353d0ea8831f89c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_27.txt @@ -0,0 +1,3 @@ +Let's check out 3 important ones: +Device +If you use device=n, the pipeline automatically puts the model on the specified device. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_28.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89b7a38cba08976c573a12565c571b6dc47839b --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_28.txt @@ -0,0 +1 @@ +This will work regardless of whether you are using PyTorch or Tensorflow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_29.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..e91bc6a4d3fcc3124532dd56e7b3b99e3b9dc608 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_29.txt @@ -0,0 +1,4 @@ +py +transcriber = pipeline(model="openai/whisper-large-v2", device=0) +If the model is too large for a single GPU and you are using PyTorch, you can set device_map="auto" to automatically +determine how to load and store the model weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_30.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..55ed3681379b75e65e1094164789017610924853 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_30.txt @@ -0,0 +1,8 @@ +Using the device_map argument requires the 🤗 Accelerate +package: + +pip install --upgrade accelerate +The following code automatically loads and stores model weights across devices: +py +transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto") +Note that if device_map="auto" is passed, there is no need to add the argument device=device when instantiating your pipeline as you may encounter some unexpected behavior! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_31.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8645dc2769699aa56f570c53498bc7f9658bd456 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_31.txt @@ -0,0 +1,2 @@ +Batch size +By default, pipelines will not batch inference for reasons explained in detail here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_32.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..02b1a630e0396f60490ecffcdc5b18d6750fdb0c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_32.txt @@ -0,0 +1 @@ +The reason is that batching is not necessarily faster, and can actually be quite slower in some cases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_33.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..00bf62ef8c070238fd495e363d21fc33d9e8eb60 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_33.txt @@ -0,0 +1,7 @@ +But if it works in your use case, you can use: +py +transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2) +audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)] +texts = transcriber(audio_filenames) +This runs the pipeline on the 4 provided audio files, but it will pass them in batches of 2 +to the model (which is on a GPU, where batching is more likely to help) without requiring any further code from you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_34.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..081d3b20c11a84f8a670a64b19696099c9933de6 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_34.txt @@ -0,0 +1 @@ +The output should always match what you would have received without batching. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_35.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..91f9f6a29337ae8858b34da66235afa16bf045e3 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_35.txt @@ -0,0 +1 @@ +It is only meant as a way to help you get more speed out of a pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_36.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..db4a4fcc945db6e237431f9dc66b7e287dbb63a3 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_36.txt @@ -0,0 +1 @@ +Pipelines can also alleviate some of the complexities of batching because, for some pipelines, a single item (like a long audio file) needs to be chunked into multiple parts to be processed by a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_37.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e96398ba82a11a9e861b7edf5630bc1be5f58e7 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_37.txt @@ -0,0 +1 @@ +The pipeline performs this chunk batching for you. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_38.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f2b593eff29d72a243a3722e0e68e7fbdb83317 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_38.txt @@ -0,0 +1,2 @@ +Task specific parameters +All tasks provide task specific parameters which allow for additional flexibility and options to help you get your job done. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_39.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..94413e1caec60e0452baab36c37bfffac4a0e938 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_39.txt @@ -0,0 +1,5 @@ +For instance, the [transformers.AutomaticSpeechRecognitionPipeline.__call__] method has a return_timestamps parameter which sounds promising for subtitling videos: + +transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True) +transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_40.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..9195e7350f2706ba4329987727a6ccf42430de51 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_40.txt @@ -0,0 +1 @@ +', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]} \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_41.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e2c7efeb69e5c44944affa6a39ccc9aac09b3a0 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_41.txt @@ -0,0 +1 @@ +As you can see, the model inferred the text and also outputted when the various sentences were pronounced. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_42.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..df4896653e6dee339e0b7797837c0e1244a1f29c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_42.txt @@ -0,0 +1 @@ +There are many parameters available for each task, so check out each task's API reference to see what you can tinker with! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_43.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb8cd3f83614a5857b7025a875906fa323d39b1e --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_43.txt @@ -0,0 +1,8 @@ +For instance, the [~transformers.AutomaticSpeechRecognitionPipeline] has a chunk_length_s parameter which is helpful +for working on really long audio files (for example, subtitling entire movies or hour-long videos) that a model typically +cannot handle on its own: +python + +transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True) +transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav") +{'text': " Chapter 16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_44.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bf0437861c40e2daee6412d201b49726c66866c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_44.txt @@ -0,0 +1 @@ +I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_45.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa87112cd6464d6eb31ab5ea906e40f06f514e03 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_45.txt @@ -0,0 +1 @@ +I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_46.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..604eb85014464cce6f041c9fb422fcb7bc550cc4 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_46.txt @@ -0,0 +1,3 @@ +It was the day after the evening + +If you can't find a parameter that would really help you out, feel free to request it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_47.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..2709bab63c9c0a3dadbaaa3977cd733e364d89b2 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_47.txt @@ -0,0 +1,2 @@ +Using pipelines on a dataset +The pipeline can also run inference on a large dataset.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_48.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..05a90fb3e5d47b5268b6cb44bde0c58a4a63fd63 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_48.txt @@ -0,0 +1,13 @@ +The easiest way we recommend doing this is by using an iterator: + +def data(): + for i in range(1000): + yield f"My example {i}" +pipe = pipeline(model="openai-community/gpt2", device=0) +generated_characters = 0 +for out in pipe(data()): + generated_characters += len(out[0]["generated_text"]) + +The iterator data() yields each result, and the pipeline automatically +recognizes the input is iterable and will start fetching the data while +it continues to process it on the GPU (this uses DataLoader under the hood). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_49.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ef2828b71a18c3fcfd972a5317c15eac407603b --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_49.txt @@ -0,0 +1,2 @@ +This is important because you don't have to allocate memory for the whole dataset +and you can feed the GPU as fast as possible. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_50.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..43b2a38ad4dc41287555957814993982fbe497de --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_50.txt @@ -0,0 +1 @@ +Since batching could speed things up, it may be useful to try tuning the batch_size parameter here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_51.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..c17973b50309743a7253120599bd4c4aff20ffe7 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_51.txt @@ -0,0 +1,3 @@ +The simplest way to iterate over a dataset is to just load one from 🤗 Datasets: + +KeyDataset is a util that will just output the item we're interested in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_52.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d7799d1fbd61d45981f070a5d6e335437d921e7 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_52.txt @@ -0,0 +1,11 @@ +from transformers.pipelines.pt_utils import KeyDataset +from datasets import load_dataset +pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0) +dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]") +for out in pipe(KeyDataset(dataset, "audio")): + print(out) + +Using pipelines for a webserver + +Creating an inference engine is a complex topic which deserves it's own +page. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_53.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..205012d4d361f28db0f87194648d2938cc4b8d80 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_53.txt @@ -0,0 +1,3 @@ +Link +Vision pipeline +Using a [pipeline] for vision tasks is practically identical. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_54.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..26f25758767c99ae0f98e2701fc26deefd57253f --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_54.txt @@ -0,0 +1 @@ +Specify your task and pass your image to the classifier. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_55.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..fda7914eb7b90f98801520045daffc4ffa898d6e --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_55.txt @@ -0,0 +1 @@ +The image can be a link, a local path or a base64-encoded image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_56.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd4cb46f4790fc5e3a9ada401fb9c877023bea44 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_56.txt @@ -0,0 +1 @@ +For example, what species of cat is shown below? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_57.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..ced7c2ccfc06366726cb65e41abf1e64b7473fe3 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_57.txt @@ -0,0 +1,11 @@ +from transformers import pipeline +vision_classifier = pipeline(model="google/vit-base-patch16-224") +preds = vision_classifier( + images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + ) +preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +preds +[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}] + +Text pipeline +Using a [pipeline] for NLP tasks is practically identical. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_58.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2de09d77e62e5899e2de3ec9196e1d657f09f6c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_58.txt @@ -0,0 +1,2 @@ +from transformers import pipeline +This model is a zero-shot-classification model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_59.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f2200bf7ac237ff30029d4e2f0195793f077d66 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_59.txt @@ -0,0 +1,4 @@ +It will classify text, except you are free to choose any label you might imagine +classifier = pipeline(model="facebook/bart-large-mnli") +classifier( + "I have a problem with my iphone that needs to be resolved asap!! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_60.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef2e8ce5ea15ad56f06de9ff9860d53f8c3de6d8 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_60.txt @@ -0,0 +1,4 @@ +", + candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], + ) +{'sequence': 'I have a problem with my iphone that needs to be resolved asap!! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_61.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..00d46ee421d057588d5860d7fa704d299edc5417 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_61.txt @@ -0,0 +1,4 @@ +', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} + +Multimodal pipeline +The [pipeline] supports more than one modality. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_62.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..48be3c0a988122c35979ef9bd56b6c0a26410c50 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_62.txt @@ -0,0 +1 @@ +For example, a visual question answering (VQA) task combines text and image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_63.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cef6924079c2f9309c153cf7979e7f1eb9413f0 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_63.txt @@ -0,0 +1 @@ +Feel free to use any image link you like and a question you want to ask about the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_64.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7e1dca5efb0e46d40991fb5bad4457673673f3 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_64.txt @@ -0,0 +1 @@ +The image can be a URL or a local path to the image. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_65.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..5361c138699658229ae770787568657961d7b9ad --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_65.txt @@ -0,0 +1,7 @@ +For example, if you use this invoice image: + +from transformers import pipeline +vqa = pipeline(model="impira/layoutlm-document-qa") +vqa( + image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", + question="What is the invoice number? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_66.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..055ef4568a8213c71e43fe7d99aabc03127b2633 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_66.txt @@ -0,0 +1,11 @@ +", + ) +[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}] + +To run the example above you need to have pytesseract installed in addition to 🤗 Transformers: + +sudo apt install -y tesseract-ocr +pip install pytesseract + +Using pipeline on large models with 🤗 accelerate: +You can easily run pipeline on large models using 🤗 accelerate! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_67.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..8815a12d78af32b978833939afcd6540685523bc --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_67.txt @@ -0,0 +1 @@ +First make sure you have installed accelerate with pip install accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_68.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17c337d7d0ed3bd78c77bf49a0d405908108bd9 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_68.txt @@ -0,0 +1 @@ +First load your model using device_map="auto"! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_69.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..644f356f692b84c66289743790bf5923a95bf0a7 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_69.txt @@ -0,0 +1 @@ +We will use facebook/opt-1.3b for our example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_70.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dc251b46d5af83b74e999bf30e224822bd9d2f4 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_70.txt @@ -0,0 +1,5 @@ +pip install accelerate +import torch +from transformers import pipeline +pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto") +output = pipe("This is a cool example! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_71.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b1d6357f8d600edbae6828e99a949186a5e7dad --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_71.txt @@ -0,0 +1,9 @@ +", do_sample=True, top_p=0.95) + +You can also pass 8-bit loaded models if you install bitsandbytes and add the argument load_in_8bit=True + +pip install accelerate bitsandbytes +import torch +from transformers import pipeline +pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True}) +output = pipe("This is a cool example! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_tutorial/chunk_72.txt b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..8318c5be50feb36a4e5ad788af323d46ad483f29 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_tutorial/chunk_72.txt @@ -0,0 +1,3 @@ +", do_sample=True, top_p=0.95) + +Note that you can replace the checkpoint with any of the Hugging Face model that supports large model loading such as BLOOM! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_11.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..49a9540c786d6929741fc4c07b3b3fc57dd8e817 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_11.txt @@ -0,0 +1,3 @@ +"},] + +And there you go, now you have a good idea of how to create a webserver! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_12.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e47abe1bcf2d9e910ba1eb943a906660cf64064 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_12.txt @@ -0,0 +1,2 @@ +What is really important is that we load the model only once, so there are no copies +of the model on the webserver. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_13.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..dec2ddda941ed91446b0c38c1599be70f97085b3 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_13.txt @@ -0,0 +1 @@ +This way, no unnecessary RAM is being used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_14.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bced17c292f4e06d57c94844cf57a289b7d32955 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_14.txt @@ -0,0 +1,4 @@ +Then the queuing mechanism allows you to do fancy stuff like maybe accumulating a few +items before inferring to use dynamic batching: + +The code sample below is intentionally written like pseudo-code for readability. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_15.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2aba32b784bb9dc6a09c9cc915781b0204619c6 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_15.txt @@ -0,0 +1 @@ +Do not run this without checking if it makes sense for your system resources! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_16.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..11e77ad050fd1d90928af352c5263fea06c35c94 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_16.txt @@ -0,0 +1,16 @@ +py +(string, rq) = await q.get() +strings = [] +queues = [] +while True: + try: + (string, rq) = await asyncio.wait_for(q.get(), timeout=0.001) # 1ms + except asyncio.exceptions.TimeoutError: + break + strings.append(string) + queues.append(rq) +strings +outs = pipe(strings, batch_size=len(strings)) +for rq, out in zip(queues, outs): + await rq.put(out) +Again, the proposed code is optimized for readability, not for being the best code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_17.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..839fe2528c6f869b0c4102751a0b8d71fca3d327 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_17.txt @@ -0,0 +1,2 @@ +First of all, there's no batch size limit which is usually not a +great idea. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_18.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fba869c90d2feba1f80322977bf7e41b7cbe0db2 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_18.txt @@ -0,0 +1,3 @@ +Next, the timeout is reset on every queue fetch, meaning you could +wait much more than 1ms before running the inference (delaying the first request +by that much). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_19.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7a5f00a5d198556acc267cf0dd922b426be567c --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_19.txt @@ -0,0 +1 @@ +It would be better to have a single 1ms deadline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_20.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ac233e0235de24c7f50d1ef056cfa164508931 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_20.txt @@ -0,0 +1,2 @@ +This will always wait for 1ms even if the queue is empty, which might not be the +best since you probably want to start doing inference if there's nothing in the queue. 
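To make those criticisms concrete, here is one possible variant of the same loop body, written in the same pseudo-code spirit, that adds a batch-size cap and a single shared deadline per batch; MAX_BATCH_SIZE and BATCH_WINDOW are illustrative names and values, not settings from the documented example:

import asyncio
import time

MAX_BATCH_SIZE = 8     # cap so a burst of requests cannot create an arbitrarily large batch
BATCH_WINDOW = 0.001   # one shared 1 ms deadline for the whole batch, not per fetch

(string, rq) = await q.get()   # block until at least one item is available
strings = [string]
queues = [rq]
deadline = time.monotonic() + BATCH_WINDOW
while len(strings) < MAX_BATCH_SIZE:
    remaining = deadline - time.monotonic()
    if remaining <= 0:
        break
    try:
        (string, rq) = await asyncio.wait_for(q.get(), timeout=remaining)
    except asyncio.TimeoutError:
        break
    strings.append(string)
    queues.append(rq)
outs = pipe(strings, batch_size=len(strings))
for rq, out in zip(queues, outs):
    await rq.put(out)

Like the original snippet, this is optimized for readability and still leaves trade-offs open, for example whether batching is worth it at all.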
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_21.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..de90793f1d599ad367457b567f4a276cfc4dddd4 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_21.txt @@ -0,0 +1 @@ +But maybe it does make sense if batching is really crucial for your use case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_22.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..89bc75d28e64cfc31b5e456fcf769006a7731bc2 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_22.txt @@ -0,0 +1 @@ +Again, there's really no one best solution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_23.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..84f4bb292bb99cf61f9a254147a45daedf7251ef --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_23.txt @@ -0,0 +1,5 @@ +Few things you might want to consider +Error checking +There's a lot that can go wrong in production: out of memory, out of space, +loading the model might fail, the query might be wrong, the query might be +correct but still fail to run because of a model misconfiguration, and so on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_24.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8820ea2995351bbf8f3f05c841065cd518c29b46 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_24.txt @@ -0,0 +1,3 @@ +Generally, it's good if the server outputs the errors to the user, so +adding a lot of try..except statements to show those errors is a good +idea. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_25.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5052adb6b2d5735ffb75b4731b449af53c91dee --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_25.txt @@ -0,0 +1,2 @@ +But keep in mind it may also be a security risk to reveal all those errors depending +on your security context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_26.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e88f09b4b28661a55dce98c3c3566e8acd31497 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_26.txt @@ -0,0 +1,2 @@ +Circuit breaking +Webservers usually look better when they do circuit breaking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_27.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbfec86d802701a829c2b058e40900a67297f519 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_27.txt @@ -0,0 +1,2 @@ +It means they +return proper errors when they're overloaded instead of just waiting for the query indefinitely. 
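As a hedged sketch of both ideas (surfacing errors to the user and refusing work when overloaded) applied to a Starlette-style handler of the kind this webserver guide is built around; the handler name, the model_queue attribute, and the 32-item threshold are assumptions for illustration only:

import asyncio
from starlette.responses import JSONResponse

MAX_QUEUE_SIZE = 32  # assumed threshold; tune it to your own latency budget

async def homepage(request):
    # Circuit breaking: refuse new work with a 503 instead of queueing it forever.
    if request.app.model_queue.qsize() >= MAX_QUEUE_SIZE:
        return JSONResponse({"error": "server overloaded, please retry later"}, status_code=503)
    try:
        payload = await request.body()
        string = payload.decode("utf-8")
        response_q = asyncio.Queue()
        await request.app.model_queue.put((string, response_q))
        output = await response_q.get()
        return JSONResponse(output)
    except Exception as exc:
        # Surfacing the error helps users and debugging, but consider how much
        # detail is safe to reveal in your security context.
        return JSONResponse({"error": str(exc)}, status_code=500)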
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_28.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e1c4ed9bd6e4bda58f0d7f83eb58f6787ab412b --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_28.txt @@ -0,0 +1 @@ +Return a 503 error instead of waiting for a super long time or a 504 after a long time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_29.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d55edba6229ba6d296c97ffaf22a85bbaa6516e --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_29.txt @@ -0,0 +1 @@ +This is relatively easy to implement in the proposed code since there is a single queue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_30.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c97614c0bf0956ca50ebb3225ff34348f54ed65 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_30.txt @@ -0,0 +1,2 @@ +Looking at the queue size is a basic way to start returning errors before your +webserver fails under load. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_31.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1cf153b6fd5eab26d99fd9e8d3fbe8964d76929 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_31.txt @@ -0,0 +1,3 @@ +Blocking the main thread +Currently PyTorch is not async aware, and computation will block the main +thread while running. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_32.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..f19f0b5abdb031be0b692279ecf2f4bba7f99964 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_32.txt @@ -0,0 +1,2 @@ +That means it would be better if PyTorch was forced to run +on its own thread/process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_33.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..03aa8f4683ef73ab64ca1d758962db3d02077625 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_33.txt @@ -0,0 +1,2 @@ +This wasn't done here because the code is a lot more +complex (mostly because threads and async and queues don't play nice together). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_34.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..a910b4fe392a34035b7cefc853f63203f062a80a --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_34.txt @@ -0,0 +1 @@ +But ultimately it does the same thing. 
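A minimal sketch of what pushing the blocking PyTorch call onto its own thread could look like, assuming a server_loop coroutine that owns the queue and the pipeline; this is one possible arrangement, not the approach taken in the documented example:

import asyncio

async def server_loop(q, pipe):
    loop = asyncio.get_running_loop()
    while True:
        (string, rq) = await q.get()
        # pipe(string) blocks while PyTorch runs, so hand it to a worker thread and
        # keep the event loop free to accept and queue incoming requests.
        out = await loop.run_in_executor(None, pipe, string)
        await rq.put(out)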
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_35.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed6b9d179c503f23eba2808fed2e8482ba602ea4 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_35.txt @@ -0,0 +1,3 @@ +This would be important if the inference of single items were long (> 1s) because +in this case, it means every query during inference would have to wait for 1s before +even receiving an error. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_36.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..905775d468ade61c70bb9e4bbf75ce2a3cb55af2 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_36.txt @@ -0,0 +1,3 @@ +Dynamic batching +In general, batching is not necessarily an improvement over passing 1 item at +a time (see batching details for more information). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_37.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f5c6f21a96425654c9fb4dc7b778274824ced97 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_37.txt @@ -0,0 +1,2 @@ +But it can be very effective +when used in the correct setting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_38.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..707461a372751abcee4be158f2b29a4ac23fc518 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_38.txt @@ -0,0 +1,2 @@ +In the API, there is no dynamic +batching by default (too much opportunity for a slowdown). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pipeline_webserver/chunk_39.txt b/chunked/content_aware_chunking/_pipeline_webserver/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd062dbe75623341098fe1233e0a7ed1f9bd88f0 --- /dev/null +++ b/chunked/content_aware_chunking/_pipeline_webserver/chunk_39.txt @@ -0,0 +1,2 @@ +But for BLOOM inference - +which is a very large model - dynamic batching is essential to provide a decent experience for everyone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_10.txt b/chunked/content_aware_chunking/_pr_checks/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2df5cf76d994e48c34e82b2c0443eebce81e724 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_10.txt @@ -0,0 +1 @@ +You can add several patterns separated by a comma. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_11.txt b/chunked/content_aware_chunking/_pr_checks/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..61216e3816b4433f2076827d46d7cb70c9403009 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_11.txt @@ -0,0 +1 @@ +For instance here CamembertForMaskedLM is a direct copy of RobertaForMaskedLM with two replacements: Roberta to Camembert and ROBERTA to CAMEMBERT.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_12.txt b/chunked/content_aware_chunking/_pr_checks/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0c1ce9bda3cd4784ca12c0390b5e54836ae91ec --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_12.txt @@ -0,0 +1,5 @@ +You can see here this is done with the comment: + +Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT + +If the order matters (because one of the replacements might conflict with a previous one), the replacements are executed from left to right. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_13.txt b/chunked/content_aware_chunking/_pr_checks/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcd11b59c7eae97096a57350f20724cc29e114f5 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_13.txt @@ -0,0 +1 @@ +If the replacements change the formatting (if you replace a short name by a very long name for instance), the copy is checked after applying the auto-formatter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_14.txt b/chunked/content_aware_chunking/_pr_checks/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f617a88889fd8754d8a60d0eb5e792c05a26cd92 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_14.txt @@ -0,0 +1 @@ +Another way when the patterns are just different casings of the same replacement (with an uppercased and a lowercased variants) is just to add the option all-casing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_15.txt b/chunked/content_aware_chunking/_pr_checks/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..92ef586913f488225ce6f3428eb8cf44da1ba4f5 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_15.txt @@ -0,0 +1,8 @@ +Here is an example in MobileBertForSequenceClassification with the comment: + +Copied from transformers.models.bert.modeling_bert.BertForSequenceClassification with Bert->MobileBert all-casing + +In this case, the code is copied from BertForSequenceClassification by replacing: +- Bert by MobileBert (for instance when using MobileBertModel in the init) +- bert by mobilebert (for instance when defining self.mobilebert) +- BERT by MOBILEBERT (in the constant MOBILEBERT_INPUTS_DOCSTRING) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_8.txt b/chunked/content_aware_chunking/_pr_checks/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbabad74c3b66e92a453a253f32365b5ed62274b --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_8.txt @@ -0,0 +1 @@ +This means the code is copied with all instances of foo being replaced by bar. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_pr_checks/chunk_9.txt b/chunked/content_aware_chunking/_pr_checks/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c05d248409986cb6a747671b0ae77b1f964264f3 --- /dev/null +++ b/chunked/content_aware_chunking/_pr_checks/chunk_9.txt @@ -0,0 +1,5 @@ +You can see how it is used here in RobertaAttention with the comment: + +Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->Roberta + +Note that there shouldn't be any spaces around the arrow (unless that space is part of the pattern to replace of course). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_100.txt b/chunked/content_aware_chunking/_preprocessing/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..e17288d6003d477ece50a32f16564d4f4d5746e4 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_100.txt @@ -0,0 +1,3 @@ +Pad +In some cases, for instance, when fine-tuning DETR, the model applies scale augmentation at training +time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_101.txt b/chunked/content_aware_chunking/_preprocessing/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..5abbc34a88966f0128a4cbfdbd7ee877585e9a4f --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_101.txt @@ -0,0 +1 @@ +This may cause images to be different sizes in a batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_102.txt b/chunked/content_aware_chunking/_preprocessing/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..300330c5c95ffd1297103796b963d634c59c85a1 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_102.txt @@ -0,0 +1,2 @@ +You can use [DetrImageProcessor.pad] +from [DetrImageProcessor] and define a custom collate_fn to batch images together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_103.txt b/chunked/content_aware_chunking/_preprocessing/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..a67e2c5f602ec4da8e9671686caedb1ccffd42fb --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_103.txt @@ -0,0 +1,12 @@ +def collate_fn(batch): + pixel_values = [item["pixel_values"] for item in batch] + encoding = image_processor.pad(pixel_values, return_tensors="pt") + labels = [item["labels"] for item in batch] + batch = {} + batch["pixel_values"] = encoding["pixel_values"] + batch["pixel_mask"] = encoding["pixel_mask"] + batch["labels"] = labels + return batch + +Multimodal +For tasks involving multimodal inputs, you'll need a processor to prepare your dataset for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_104.txt b/chunked/content_aware_chunking/_preprocessing/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..c40fe93306c52197a930e6000ffc828d51950503 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_104.txt @@ -0,0 +1 @@ +A processor couples together two processing objects such as a tokenizer and feature extractor.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_105.txt b/chunked/content_aware_chunking/_preprocessing/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4cb138a91c76318ce2ea34c80300843a3a8583c --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_105.txt @@ -0,0 +1,20 @@ +Load the LJ Speech dataset (see the 🤗 Datasets tutorial for more details on how to load a dataset) to see how you can use a processor for automatic speech recognition (ASR): + +from datasets import load_dataset +lj_speech = load_dataset("lj_speech", split="train") + +For ASR, you're mainly focused on audio and text so you can remove the other columns: + +lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"]) + +Now take a look at the audio and text columns: + +lj_speech[0]["audio"] +{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, , + 7.3242188e-04, 2.1362305e-04, 6.1035156e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav', + 'sampling_rate': 22050} +lj_speech[0]["text"] +'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition' + +Remember you should always resample your audio dataset's sampling rate to match the sampling rate of the dataset used to pretrain a model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_106.txt b/chunked/content_aware_chunking/_preprocessing/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ad2895dfcca7edc2c2c2e59e21d533a916e0868 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_106.txt @@ -0,0 +1,8 @@ +lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000)) + +Load a processor with [AutoProcessor.from_pretrained]: + +from transformers import AutoProcessor +processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + +Create a function to process the audio data contained in array to input_values, and tokenize text to labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_107.txt b/chunked/content_aware_chunking/_preprocessing/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ce0a3921b6d61ec64227fa2e0df6678d8ccb35b --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_107.txt @@ -0,0 +1,13 @@ +These are the inputs to the model: + +def prepare_dataset(example): + audio = example["audio"] + + example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000)) + return example + +Apply the prepare_dataset function to a sample: + +prepare_dataset(lj_speech[0]) + +The processor has now added input_values and labels, and the sampling rate has also been correctly downsampled to 16kHz. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_108.txt b/chunked/content_aware_chunking/_preprocessing/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7061eb9f2d2ee85cdd5a9a3f432c5c04bedb906 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_108.txt @@ -0,0 +1 @@ +You can pass your processed dataset to the model now! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_32.txt b/chunked/content_aware_chunking/_preprocessing/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1940e4941cca980d3e5c0a9a35484bd4d326b6a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_32.txt @@ -0,0 +1 @@ +In this case, you'll need to truncate the sequence to a shorter length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_33.txt b/chunked/content_aware_chunking/_preprocessing/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..159a9a89c725ddcb1eaa77ecdf9e804b040e02f4 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_33.txt @@ -0,0 +1,4 @@ +Set the truncation parameter to True to truncate a sequence to the maximum length accepted by the model: + +batch_sentences = [ + "But what about second breakfast? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_34.txt b/chunked/content_aware_chunking/_preprocessing/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4abb85f0a66db0f082bf91d80218f0f643eb096 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_34.txt @@ -0,0 +1,2 @@ +", + "Don't think he knows about second breakfast, Pip. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_35.txt b/chunked/content_aware_chunking/_preprocessing/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6deabcbae1ed8e2c021759429554fc69c166ed --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_35.txt @@ -0,0 +1,2 @@ +", + "What about elevensies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_36.txt b/chunked/content_aware_chunking/_preprocessing/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7ba1067bc8cbac8f046ae2d9fa70ad1c14fe199 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_36.txt @@ -0,0 +1,15 @@ +", + ] +encoded_input = tokenizer(batch_sentences, padding=True, truncation=True) +print(encoded_input) +{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]], + 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], + 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]} + +Check out the Padding and truncation concept guide to learn more about the different padding and truncation arguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_37.txt b/chunked/content_aware_chunking/_preprocessing/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff764bb238663b820d53a8c9c53c66fe9d22462c --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_37.txt @@ -0,0 +1,2 @@ +Build tensors +Finally, you want the tokenizer to return the actual tensors that get fed to the model.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_38.txt b/chunked/content_aware_chunking/_preprocessing/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e7f6317b435fafbfec10d85aff7d19d9bc84b63 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_38.txt @@ -0,0 +1,4 @@ +Set the return_tensors parameter to either pt for PyTorch, or tf for TensorFlow: + +batch_sentences = [ + "But what about second breakfast? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_39.txt b/chunked/content_aware_chunking/_preprocessing/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4abb85f0a66db0f082bf91d80218f0f643eb096 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_39.txt @@ -0,0 +1,2 @@ +", + "Don't think he knows about second breakfast, Pip. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_40.txt b/chunked/content_aware_chunking/_preprocessing/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6deabcbae1ed8e2c021759429554fc69c166ed --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_40.txt @@ -0,0 +1,2 @@ +", + "What about elevensies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_41.txt b/chunked/content_aware_chunking/_preprocessing/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..32765809459a67e9f81b9bd417689d699ee35986 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_41.txt @@ -0,0 +1,17 @@ +", + ] +encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt") +print(encoded_input) +{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0], + [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102], + [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), + 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0], + [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])} + +py +batch_sentences = [ + "But what about second breakfast? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_42.txt b/chunked/content_aware_chunking/_preprocessing/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4abb85f0a66db0f082bf91d80218f0f643eb096 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_42.txt @@ -0,0 +1,2 @@ +", + "Don't think he knows about second breakfast, Pip. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_43.txt b/chunked/content_aware_chunking/_preprocessing/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6deabcbae1ed8e2c021759429554fc69c166ed --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_43.txt @@ -0,0 +1,2 @@ +", + "What about elevensies? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_44.txt b/chunked/content_aware_chunking/_preprocessing/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..cabb035565addd5e52651a2e9df92a2ceb2f4141 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_44.txt @@ -0,0 +1,9 @@ +", + ] +encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf") +print(encoded_input) +{'input_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=array([...])>, + 'token_type_ids': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=array([...])>, + 'attention_mask': <tf.Tensor: shape=(3, 15), dtype=int32, numpy=array([...])>} + +Different pipelines support tokenizer arguments in their __call__() differently. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_45.txt b/chunked/content_aware_chunking/_preprocessing/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..acb3308b876d991f26f02ce5c7ce1c4c25cdd15e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_45.txt @@ -0,0 +1 @@ +text-2-text-generation pipelines support (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_46.txt b/chunked/content_aware_chunking/_preprocessing/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..424bc9d9e8b40157e4d7ac13799850f235ba194d --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_46.txt @@ -0,0 +1,2 @@ +pass on) +only truncation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_47.txt b/chunked/content_aware_chunking/_preprocessing/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a584c16547b22be65e46fae1f784d025f6fb72 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_47.txt @@ -0,0 +1 @@ +text-generation pipelines support max_length, truncation, padding and add_special_tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_48.txt b/chunked/content_aware_chunking/_preprocessing/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..446514b62cd1245d7813bf575ec3a5636fe075dd --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_48.txt @@ -0,0 +1 @@ +In fill-mask pipelines, tokenizer arguments can be passed in the tokenizer_kwargs argument (dictionary). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_49.txt b/chunked/content_aware_chunking/_preprocessing/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..588e8136b57de52e393faeb8bb8ecba74c06936f --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_49.txt @@ -0,0 +1,2 @@ +Audio +For audio tasks, you'll need a feature extractor to prepare your dataset for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_50.txt b/chunked/content_aware_chunking/_preprocessing/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..5038ad9335f75350ffd78b05df06e01f97623a8e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_50.txt @@ -0,0 +1 @@ +The feature extractor is designed to extract features from raw audio data, and convert them into tensors.
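To make the fill-mask note above concrete, here is a hedged sketch of passing tokenizer arguments through tokenizer_kwargs; the checkpoint, prompt, and argument values are illustrative assumptions rather than part of the original text:

from transformers import pipeline

# Sketch: fill-mask pipelines forward extra tokenizer arguments via the
# tokenizer_kwargs dictionary, as noted above. The checkpoint is only an example.
fill_mask = pipeline("fill-mask", model="distilbert-base-uncased")
outputs = fill_mask(
    "Second breakfast is the most important [MASK] of the day.",
    tokenizer_kwargs={"truncation": True, "max_length": 32},
)
print(outputs[0]["token_str"])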
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_51.txt b/chunked/content_aware_chunking/_preprocessing/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..191586467a31468bc8559e792340b45316528b2b --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_51.txt @@ -0,0 +1,6 @@ +Load the MInDS-14 dataset (see the 🤗 Datasets tutorial for more details on how to load a dataset) to see how you can use a feature extractor with audio datasets: + +from datasets import load_dataset, Audio +dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") + +Access the first element of the audio column to take a look at the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_52.txt b/chunked/content_aware_chunking/_preprocessing/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..c192be7ca7a9ff628c5e3fcd38c223fa75267226 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_52.txt @@ -0,0 +1,5 @@ +Calling the audio column automatically loads and resamples the audio file: + +dataset[0]["audio"] +{'array': array([ 0. , 0.00024414, -0.00024414, , -0.00024414, + 0. , 0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_53.txt b/chunked/content_aware_chunking/_preprocessing/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..5311144bb37ec257cf962cc00dd94c23b3595ccb --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_53.txt @@ -0,0 +1,7 @@ +], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 8000} + +This returns three items: + +array is the speech signal loaded - and potentially resampled - as a 1D array. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_54.txt b/chunked/content_aware_chunking/_preprocessing/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..6272ef8dc496f5de6a0514de4f9e2b02f7a59ccc --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_54.txt @@ -0,0 +1 @@ +path points to the location of the audio file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_55.txt b/chunked/content_aware_chunking/_preprocessing/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b0f618ac146a845672e3777e4bd475cbe4192a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_55.txt @@ -0,0 +1 @@ +sampling_rate refers to how many data points in the speech signal are measured per second. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_56.txt b/chunked/content_aware_chunking/_preprocessing/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d5e444000059cc94a5b9f7c897bcb806ac0cabb --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_56.txt @@ -0,0 +1 @@ +For this tutorial, you'll use the Wav2Vec2 model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_57.txt b/chunked/content_aware_chunking/_preprocessing/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..5973cb9e9917e407c8251bd2108dd0a0f9983631 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_57.txt @@ -0,0 +1 @@ +Take a look at the model card, and you'll learn Wav2Vec2 is pretrained on 16kHz sampled speech audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_58.txt b/chunked/content_aware_chunking/_preprocessing/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c9ce28ccbfa0631c7b2ef659655e35ecfbf22b8 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_58.txt @@ -0,0 +1 @@ +It is important your audio data's sampling rate matches the sampling rate of the dataset used to pretrain the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_59.txt b/chunked/content_aware_chunking/_preprocessing/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..65d1545b92c8cd0f9b7b8a49af29c67614b4e0a3 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_59.txt @@ -0,0 +1 @@ +If your data's sampling rate isn't the same, then you need to resample your data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_60.txt b/chunked/content_aware_chunking/_preprocessing/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..9465c0fa46183fea0800ea9ca44c78453f9f60a0 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_60.txt @@ -0,0 +1,13 @@ +Use 🤗 Datasets' [~datasets.Dataset.cast_column] method to upsample the sampling rate to 16kHz: + +dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000)) + +Call the audio column again to resample the audio file: + +dataset[0]["audio"] +{'array': array([ 2.3443763e-05, 2.1729663e-04, 2.2145823e-04, , + 3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32), + 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav', + 'sampling_rate': 16000} + +Next, load a feature extractor to normalize and pad the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_61.txt b/chunked/content_aware_chunking/_preprocessing/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..4eb3784604300ee0177c16587ceff76876cc89b9 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_61.txt @@ -0,0 +1 @@ +When padding textual data, a 0 is added for shorter sequences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_62.txt b/chunked/content_aware_chunking/_preprocessing/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4fa1e7743386c335aa53a5066b6edfc7bc2477a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_62.txt @@ -0,0 +1 @@ +The same idea applies to audio data. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_63.txt b/chunked/content_aware_chunking/_preprocessing/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9a4646b04c3b292c0a3f38047cd2d589204f823 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_63.txt @@ -0,0 +1 @@ +The feature extractor adds a 0 - interpreted as silence - to array. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_64.txt b/chunked/content_aware_chunking/_preprocessing/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72e09598cf11508ce015db3fa8bf53bdafbf5a5 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_64.txt @@ -0,0 +1,6 @@ +Load the feature extractor with [AutoFeatureExtractor.from_pretrained]: + +from transformers import AutoFeatureExtractor +feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base") + +Pass the audio array to the feature extractor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_65.txt b/chunked/content_aware_chunking/_preprocessing/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d09325180168a2ce236c6fea4a3edb6051f1824 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_65.txt @@ -0,0 +1 @@ +We also recommend adding the sampling_rate argument in the feature extractor in order to better debug any silent errors that may occur. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_66.txt b/chunked/content_aware_chunking/_preprocessing/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..c254192dbd6f0cab1523f9954f692bf8fce15b31 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_66.txt @@ -0,0 +1,6 @@ +audio_input = [dataset[0]["audio"]["array"]] +feature_extractor(audio_input, sampling_rate=16000) +{'input_values': [array([ 3.8106556e-04, 2.7506407e-03, 2.8015103e-03, , + 5.6335266e-04, 4.6588284e-06, -1.7142107e-04], dtype=float32)]} + +Just like the tokenizer, you can apply padding or truncation to handle variable sequences in a batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_67.txt b/chunked/content_aware_chunking/_preprocessing/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..c65986f8a5fe86f6d19601a1eacc17e49eb57077 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_67.txt @@ -0,0 +1,8 @@ +Take a look at the sequence length of these two audio samples: + +dataset[0]["audio"]["array"].shape +(173398,) +dataset[1]["audio"]["array"].shape +(106496,) + +Create a function to preprocess the dataset so the audio samples are the same lengths. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_68.txt b/chunked/content_aware_chunking/_preprocessing/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffbf4ab19e87d2535bdc4ecf1f4ca4a8ae9b71a4 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_68.txt @@ -0,0 +1,18 @@ +Specify a maximum sample length, and the feature extractor will either pad or truncate the sequences to match it: + +def preprocess_function(examples): + audio_arrays = [x["array"] for x in examples["audio"]] + inputs = feature_extractor( + audio_arrays, + sampling_rate=16000, + padding=True, + max_length=100000, + truncation=True, + ) + return inputs + +Apply the preprocess_function to the first few examples in the dataset: + +processed_dataset = preprocess_function(dataset[:5]) + +The sample lengths are now the same and match the specified maximum length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_69.txt b/chunked/content_aware_chunking/_preprocessing/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7061eb9f2d2ee85cdd5a9a3f432c5c04bedb906 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_69.txt @@ -0,0 +1 @@ +You can pass your processed dataset to the model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_70.txt b/chunked/content_aware_chunking/_preprocessing/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2518f40f31abad91051801de14652cd99b51728 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_70.txt @@ -0,0 +1,7 @@ +processed_dataset["input_values"][0].shape +(100000,) +processed_dataset["input_values"][1].shape +(100000,) + +Computer vision +For computer vision tasks, you'll need an image processor to prepare your dataset for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_71.txt b/chunked/content_aware_chunking/_preprocessing/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..599a160b3db652c79ceaffd35b3aeb551c28238e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_71.txt @@ -0,0 +1 @@ +Image preprocessing consists of several steps that convert images into the input expected by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_72.txt b/chunked/content_aware_chunking/_preprocessing/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dba025b4c55b3b41eb6951a2d1e6828a18678bb --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_72.txt @@ -0,0 +1,2 @@ +These steps +include but are not limited to resizing, normalizing, color channel correction, and converting images to tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_73.txt b/chunked/content_aware_chunking/_preprocessing/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ed3a2f28f5ceceeb978a27e8281007c778c666f --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_73.txt @@ -0,0 +1 @@ +Image preprocessing often follows some form of image augmentation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_74.txt b/chunked/content_aware_chunking/_preprocessing/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..c28c5584efce56799e805b7e7e5713e662f9669d --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_74.txt @@ -0,0 +1,4 @@ +Both image preprocessing and image augmentation +transform image data, but they serve different purposes: + +Image augmentation alters images in a way that can help prevent overfitting and increase the robustness of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_75.txt b/chunked/content_aware_chunking/_preprocessing/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..082738af9f895f50ea12011fe7bebce44b82261a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_75.txt @@ -0,0 +1 @@ +You can get creative in how you augment your data - adjust brightness and colors, crop, rotate, resize, zoom, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_76.txt b/chunked/content_aware_chunking/_preprocessing/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..2540a531d12bb4066bd3559995390eb341848e7c --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_76.txt @@ -0,0 +1 @@ +However, be mindful not to change the meaning of the images with your augmentations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_77.txt b/chunked/content_aware_chunking/_preprocessing/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..71d0c453e46a0d4f2f5782f4bf985f3c1b62533e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_77.txt @@ -0,0 +1 @@ +Image preprocessing guarantees that the images match the model’s expected input format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_78.txt b/chunked/content_aware_chunking/_preprocessing/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aa807ff5fc30071d45bbf145d81e67a67600604 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_78.txt @@ -0,0 +1 @@ +When fine-tuning a computer vision model, images must be preprocessed exactly as when the model was initially trained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_79.txt b/chunked/content_aware_chunking/_preprocessing/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..eef75a453412d263e483ea957a3cd54c74ab6b6e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_79.txt @@ -0,0 +1 @@ +You can use any library you like for image augmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_80.txt b/chunked/content_aware_chunking/_preprocessing/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..12311d28fc0469005646fc019a3cc5b7e5ad454e --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_80.txt @@ -0,0 +1 @@ +For image preprocessing, use the ImageProcessor associated with the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_81.txt b/chunked/content_aware_chunking/_preprocessing/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..802d399d0d0076656392391c98a8053a385f9c47 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_81.txt @@ -0,0 +1,3 @@ +Load the food101 dataset (see the 🤗 Datasets tutorial for more details on how to load a dataset) to see how you can use an image processor with computer vision datasets: + +Use 🤗 Datasets split parameter to only load a small sample from the training split since the dataset is quite large! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_82.txt b/chunked/content_aware_chunking/_preprocessing/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..63dabc35f501877ddec75a1714b8997f35a4317a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_82.txt @@ -0,0 +1,13 @@ +from datasets import load_dataset +dataset = load_dataset("food101", split="train[:100]") + +Next, take a look at the image with 🤗 Datasets Image feature: + +dataset[0]["image"] + +Load the image processor with [AutoImageProcessor.from_pretrained]: + +from transformers import AutoImageProcessor +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") + +First, let's add some image augmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_83.txt b/chunked/content_aware_chunking/_preprocessing/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce5b008103aa04a54d32302e0407ca8934d44d17 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_83.txt @@ -0,0 +1 @@ +You can use any library you prefer, but in this tutorial, we'll use torchvision's transforms module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_84.txt b/chunked/content_aware_chunking/_preprocessing/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3d176cd34a6ceb3f8e34dfc01121edfc5cf4c74 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_84.txt @@ -0,0 +1 @@ +If you're interested in using another data augmentation library, learn how in the Albumentations or Kornia notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_85.txt b/chunked/content_aware_chunking/_preprocessing/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4543eb5cab6a0311199377113a0114a49d09d0d --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_85.txt @@ -0,0 +1,2 @@ +Here we use Compose to chain together a couple of +transforms - RandomResizedCrop and ColorJitter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_86.txt b/chunked/content_aware_chunking/_preprocessing/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..410822e9475e386a072b8641056043a5f2fab495 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_86.txt @@ -0,0 +1 @@ +Note that for resizing, we can get the image size requirements from the image_processor. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_87.txt b/chunked/content_aware_chunking/_preprocessing/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e8de89e217a021e9cb2f20c7976c014e9091c6b --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_87.txt @@ -0,0 +1,2 @@ +For some models, an exact height and +width are expected, for others only the shortest_edge is defined. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_88.txt b/chunked/content_aware_chunking/_preprocessing/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..029a9771ba0f0155a8558b51b57077d96fd9008d --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_88.txt @@ -0,0 +1,10 @@ +from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose +size = ( + image_processor.size["shortest_edge"] + if "shortest_edge" in image_processor.size + else (image_processor.size["height"], image_processor.size["width"]) + ) +_transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)]) + +The model accepts pixel_values +as its input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_89.txt b/chunked/content_aware_chunking/_preprocessing/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..e66b1507d238896fe7b6c0926041c8612dc18024 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_89.txt @@ -0,0 +1 @@ +ImageProcessor can take care of normalizing the images, and generating appropriate tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_90.txt b/chunked/content_aware_chunking/_preprocessing/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..906c7049ab27f93ce87afc2d73b8a5d5ed98fb52 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_90.txt @@ -0,0 +1,9 @@ +Create a function that combines image augmentation and image preprocessing for a batch of images and generates pixel_values: + +def transforms(examples): + images = [_transforms(img.convert("RGB")) for img in examples["image"]] + examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"] + return examples + +In the example above we set do_resize=False because we have already resized the images in the image augmentation transformation, +and leveraged the size attribute from the appropriate image_processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_91.txt b/chunked/content_aware_chunking/_preprocessing/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e786ebecec484efd00fc1c0d0fae86fe6f80d76 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_91.txt @@ -0,0 +1,2 @@ +If you do not resize images during image augmentation, +leave this parameter out. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_92.txt b/chunked/content_aware_chunking/_preprocessing/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0cdb3b1f9f5dd97adc02867fb2e02e1e6b8e2cb --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_92.txt @@ -0,0 +1 @@ +By default, ImageProcessor will handle the resizing. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_93.txt b/chunked/content_aware_chunking/_preprocessing/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..aed051ddae262ab0f97ccf52cecc4c3830bc6ecc --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_93.txt @@ -0,0 +1,2 @@ +If you wish to normalize images as a part of the augmentation transformation, use the image_processor.image_mean, +and image_processor.image_std values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_94.txt b/chunked/content_aware_chunking/_preprocessing/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..5200c1e6df361be00ab547ffd0789a3d9ae813be --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_94.txt @@ -0,0 +1,5 @@ +Then use 🤗 Datasets [~datasets.Dataset.set_transform] to apply the transforms on the fly: + +dataset.set_transform(transforms) + +Now when you access the image, you'll notice the image processor has added pixel_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_95.txt b/chunked/content_aware_chunking/_preprocessing/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7061eb9f2d2ee85cdd5a9a3f432c5c04bedb906 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_95.txt @@ -0,0 +1 @@ +You can pass your processed dataset to the model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_96.txt b/chunked/content_aware_chunking/_preprocessing/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea58deebc6a049a769e1b46e4ee46807ac5bb004 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_96.txt @@ -0,0 +1,3 @@ +dataset[0].keys() + +Here is what the image looks like after the transforms are applied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_97.txt b/chunked/content_aware_chunking/_preprocessing/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..a186a94ec0c1f80a262335e5d35526fc7981cc9a --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_97.txt @@ -0,0 +1 @@ +The image has been randomly cropped and its color properties are different. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_98.txt b/chunked/content_aware_chunking/_preprocessing/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..1317f4d4180307627e9390adfbaa421078a14d92 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_98.txt @@ -0,0 +1,7 @@ +import numpy as np +import matplotlib.pyplot as plt +img = dataset[0]["pixel_values"] +plt.imshow(img.permute(1, 2, 0)) + +For tasks like object detection, semantic segmentation, instance segmentation, and panoptic segmentation, ImageProcessor +offers post-processing methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_preprocessing/chunk_99.txt b/chunked/content_aware_chunking/_preprocessing/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..c04a710f8ef012a7bb6708a34f3b38d006480897 --- /dev/null +++ b/chunked/content_aware_chunking/_preprocessing/chunk_99.txt @@ -0,0 +1,2 @@ +These methods convert the model's raw outputs into meaningful predictions such as bounding boxes, +or segmentation maps.
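Tying back to the normalization note above: here is a hedged sketch of folding normalization into the torchvision augmentation chain using the statistics stored on the image processor; the checkpoint matches the earlier example, and the exact transform choices are assumptions:

from torchvision.transforms import ColorJitter, Compose, Normalize, RandomResizedCrop, ToTensor
from transformers import AutoImageProcessor

# Sketch: reuse image_processor.image_mean and image_processor.image_std instead of
# hard-coding normalization statistics inside the augmentation pipeline.
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
size = (image_processor.size["height"], image_processor.size["width"])
_transforms = Compose(
    [
        RandomResizedCrop(size),
        ColorJitter(brightness=0.5, hue=0.5),
        ToTensor(),  # Normalize expects a tensor, so convert the PIL image first
        Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
    ]
)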
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_100.txt b/chunked/content_aware_chunking/_quantization/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..d326982590a467211a4ae02842ecc5030b111f2e --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_100.txt @@ -0,0 +1 @@ +This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_101.txt b/chunked/content_aware_chunking/_quantization/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc8a29239c4678807f108d7a2eab34e71977973b --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_101.txt @@ -0,0 +1 @@ +For example, with nested quantization, you can finetune a Llama-13b model on a 16GB NVIDIA T4 GPU with a sequence length of 1024, a batch size of 1, and gradient accumulation with 4 steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_102.txt b/chunked/content_aware_chunking/_quantization/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..880bb7e453b477b94b364af3cf7df5f6795cec50 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_102.txt @@ -0,0 +1,9 @@ +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +double_quant_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_use_double_quant=True, +) +model_double_quant = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-13b", quantization_config=double_quant_config) + +Optimum +The Optimum library supports quantization for Intel, Furiosa, ONNX Runtime, GPTQ, and lower-level PyTorch quantization functions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_103.txt b/chunked/content_aware_chunking/_quantization/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..c14799f9797beff771415ff4e026feb7cacac9c5 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_103.txt @@ -0,0 +1 @@ +Consider using Optimum for quantization if you're using specific and optimized hardware like Intel CPUs, Furiosa NPUs or a model accelerator like ONNX Runtime. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_104.txt b/chunked/content_aware_chunking/_quantization/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..866163a3284cb9ce88ac10849a75cc60e98358fe --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_104.txt @@ -0,0 +1,2 @@ +Benchmarks +To compare the speed, throughput, and latency of each quantization scheme, check the following benchmarks obtained from the optimum-benchmark library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_105.txt b/chunked/content_aware_chunking/_quantization/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b21b404d935c3a24bba17a9cf0cf5311f141757 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_105.txt @@ -0,0 +1 @@ +The benchmark was run on an NVIDIA A1000 for the TheBloke/Mistral-7B-v0.1-AWQ and TheBloke/Mistral-7B-v0.1-GPTQ models.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_106.txt b/chunked/content_aware_chunking/_quantization/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..00b80949c11ddaa8b47f93182853793178746aab --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_106.txt @@ -0,0 +1 @@ +These were also tested against the bitsandbytes quantization methods as well as a native fp16 model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_107.txt b/chunked/content_aware_chunking/_quantization/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a2c020a968512396129cb290e358938aaf4eb0 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_107.txt @@ -0,0 +1,9 @@ +forward peak memory/batch size + +generate peak memory/batch size + +generate throughput/batch size + +forward latency/batch size + +The benchmarks indicate AWQ quantization is the fastest for inference, text generation, and has the lowest peak memory for text generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_108.txt b/chunked/content_aware_chunking/_quantization/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4dc1ea1517c7c09e4ec025fa59078107bcafa30 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_108.txt @@ -0,0 +1 @@ +However, AWQ has the largest forward latency per batch size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_109.txt b/chunked/content_aware_chunking/_quantization/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd45dadd813ed313b20bc17b3c709395d860332 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_109.txt @@ -0,0 +1 @@ +For a more detailed discussion about the pros and cons of each quantization method, read the Overview of natively supported quantization schemes in 🤗 Transformers blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_110.txt b/chunked/content_aware_chunking/_quantization/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..5422bc37d78ce61728b62a5b9ac75d133450a71a --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_110.txt @@ -0,0 +1,2 @@ +Fused AWQ modules +The TheBloke/Mistral-7B-OpenOrca-AWQ model was benchmarked with batch_size=1 with and without fused modules. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_111.txt b/chunked/content_aware_chunking/_quantization/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..b439d429b97086233a8ae80fa5a456e6727814f0 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_111.txt @@ -0,0 +1,21 @@ +Unfused module +| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) | +|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------| +| 1 | 32 | 32 | 60.0984 | 38.4537 | 4.50 GB (5.68%) | +| 1 | 64 | 64 | 1333.67 | 31.6604 | 4.50 GB (5.68%) | +| 1 | 128 | 128 | 2434.06 | 31.6272 | 4.50 GB (5.68%) | +| 1 | 256 | 256 | 3072.26 | 38.1731 | 4.50 GB (5.68%) | +| 1 | 512 | 512 | 3184.74 | 31.6819 | 4.59 GB (5.80%) | +| 1 | 1024 | 1024 | 3148.18 | 36.8031 | 4.81 GB (6.07%) | +| 1 | 2048 | 2048 | 2927.33 | 35.2676 | 5.73 GB (7.23%) | +Fused module +| Batch Size | Prefill Length | Decode Length | Prefill tokens/s | Decode tokens/s | Memory (VRAM) | +|-------------:|-----------------:|----------------:|-------------------:|------------------:|:----------------| +| 1 | 32 | 32 | 81.4899 | 80.2569 | 4.00 GB (5.05%) | +| 1 | 64 | 64 | 1756.1 | 106.26 | 4.00 GB (5.05%) | +| 1 | 128 | 128 | 2479.32 | 105.631 | 4.00 GB (5.06%) | +| 1 | 256 | 256 | 1813.6 | 85.7485 | 4.01 GB (5.06%) | +| 1 | 512 | 512 | 2848.9 | 97.701 | 4.11 GB (5.19%) | +| 1 | 1024 | 1024 | 3044.35 | 87.7323 | 4.41 GB (5.57%) | +| 1 | 2048 | 2048 | 2715.11 | 89.4709 | 5.57 GB (7.04%) | +The speed and throughput of fused and unfused modules were also tested with the optimum-benchmark library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_112.txt b/chunked/content_aware_chunking/_quantization/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..d854466d0f7e7704965efc27987d55eb4fb236a1 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_112.txt @@ -0,0 +1,3 @@ +forward peak memory/batch size + +generate throughput/batch size \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_49.txt b/chunked/content_aware_chunking/_quantization/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..170fd70764a28238ed56234a4c46ff2e2a655b7b --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_49.txt @@ -0,0 +1,2 @@ +py +dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_50.txt b/chunked/content_aware_chunking/_quantization/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cd8e74c9e71f527a1b2babf317901263edfd430 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_50.txt @@ -0,0 +1,2 @@ +gptq_config = GPTQConfig(bits=4, dataset=dataset, tokenizer=tokenizer) +Load a model to quantize and pass the gptq_config to the [~AutoModelForCausalLM.from_pretrained] method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_51.txt b/chunked/content_aware_chunking/_quantization/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9d8cfd715fd254d9fb577fae8eb5f967a2dd1eb --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_51.txt @@ -0,0 +1 @@ +Set device_map="auto" to automatically offload the model to a CPU to help fit the model in memory, and allow the model modules to be moved between the CPU and GPU for quantization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_52.txt b/chunked/content_aware_chunking/_quantization/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..96c7e79277aa2f9191d25273131d0db873c3b9df --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_52.txt @@ -0,0 +1,3 @@ +py +quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config) +If you're running out of memory because a dataset is too large, disk offloading is not supported. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_53.txt b/chunked/content_aware_chunking/_quantization/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..a24122019aaf500500b6d1d8b3311c0cad0eedd0 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_53.txt @@ -0,0 +1,5 @@ +If this is the case, try passing the max_memory parameter to allocate the amount of memory to use on your device (GPU and CPU): +py +quantized_model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", max_memory={0: "30GiB", 1: "46GiB", "cpu": "30GiB"}, quantization_config=gptq_config) + +Depending on your hardware, it can take some time to quantize a model from scratch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_54.txt b/chunked/content_aware_chunking/_quantization/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9087640f05a288dd7ceda096c0debf71b143e40 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_54.txt @@ -0,0 +1 @@ +It can take ~5 minutes to quantize the facebook/opt-350m model on a free-tier Google Colab GPU, but it'll take ~4 hours to quantize a 175B parameter model on a NVIDIA A100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_55.txt b/chunked/content_aware_chunking/_quantization/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..191f00971551844e0600a8b7589810e79ccf4325 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_55.txt @@ -0,0 +1 @@ +Before you quantize a model, it is a good idea to check the Hub if a GPTQ-quantized version of the model already exists. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_56.txt b/chunked/content_aware_chunking/_quantization/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bbe4fe70881abdadaccbabd134f5ec98f4b2dc9 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_56.txt @@ -0,0 +1 @@ +Once your model is quantized, you can push the model and tokenizer to the Hub where it can be easily shared and accessed. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_57.txt b/chunked/content_aware_chunking/_quantization/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..a143314a6d793f2ff986ad6102bf4e4ca4397fca --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_57.txt @@ -0,0 +1,5 @@ +Use the [~PreTrainedModel.push_to_hub] method to save the [GPTQConfig]: +py +quantized_model.push_to_hub("opt-125m-gptq") +tokenizer.push_to_hub("opt-125m-gptq") +You could also save your quantized model locally with the [~PreTrainedModel.save_pretrained] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_58.txt b/chunked/content_aware_chunking/_quantization/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ab80a708feb6b0e3429f1f451eb166ad985ae56 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_58.txt @@ -0,0 +1 @@ +If the model was quantized with the device_map parameter, make sure to move the entire model to a GPU or CPU before saving it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_59.txt b/chunked/content_aware_chunking/_quantization/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb33fd985cd1d7e8879d90be70da5646059ccd87 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_59.txt @@ -0,0 +1,9 @@ +For example, to save the model on a CPU: + +quantized_model.save_pretrained("opt-125m-gptq") +tokenizer.save_pretrained("opt-125m-gptq") +if quantized with device_map set +quantized_model.to("cpu") +quantized_model.save_pretrained("opt-125m-gptq") + +Reload a quantized model with the [~PreTrainedModel.from_pretrained] method, and set device_map="auto" to automatically distribute the model on all available GPUs to load the model faster without using more memory than needed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_60.txt b/chunked/content_aware_chunking/_quantization/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a60a1926197c85f608f16e3b777ac9acb1eadc2 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_60.txt @@ -0,0 +1,5 @@ +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto") + +ExLlama +ExLlama is a Python/C++/CUDA implementation of the Llama model that is designed for faster inference with 4-bit GPTQ weights (check out these benchmarks). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_61.txt b/chunked/content_aware_chunking/_quantization/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..139051f3d9e0ec06b0936d094a3209aca3c9b8ff --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_61.txt @@ -0,0 +1 @@ +The ExLlama kernel is activated by default when you create a [GPTQConfig] object. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_62.txt b/chunked/content_aware_chunking/_quantization/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ea9cc66f8f4226e7b1b49292c621f30e526ece8 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_62.txt @@ -0,0 +1,8 @@ +To boost inference speed even further, use the ExLlamaV2 kernels by configuring the exllama_config parameter: + +import torch +from transformers import AutoModelForCausalLM, GPTQConfig +gptq_config = GPTQConfig(bits=4, exllama_config={"version":2}) +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config) + +Only 4-bit models are supported, and we recommend deactivating the ExLlama kernels if you're finetuning a quantized model with PEFT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_63.txt b/chunked/content_aware_chunking/_quantization/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..61f6100f86c880d1618170955045a01a573039fc --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_63.txt @@ -0,0 +1 @@ +The ExLlama kernels are only supported when the entire model is on the GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_64.txt b/chunked/content_aware_chunking/_quantization/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..3775301ef8d70c6107b81f7b3a7d390a9011918f --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_64.txt @@ -0,0 +1 @@ +If you're doing inference on a CPU with AutoGPTQ (version > 0.4.2), then you'll need to disable the ExLlama kernel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_65.txt b/chunked/content_aware_chunking/_quantization/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2f2cf20f70246e054610f4aec7dd00a91502ad8 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_65.txt @@ -0,0 +1 @@ +This overwrites the attributes related to the ExLlama kernels in the quantization config of the config.json file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_66.txt b/chunked/content_aware_chunking/_quantization/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c0c9bed40cc4b9d43e907acbd31e7b682a5813 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_66.txt @@ -0,0 +1,7 @@ +py +import torch +from transformers import AutoModelForCausalLM, GPTQConfig +gptq_config = GPTQConfig(bits=4, use_exllama=False) +model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="cpu", quantization_config=gptq_config) +bitsandbytes +bitsandbytes is the easiest option for quantizing a model to 8 and 4-bit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_67.txt b/chunked/content_aware_chunking/_quantization/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cb09f31d3aa1ca9bee9d99c35368a14f38f6305 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_67.txt @@ -0,0 +1 @@ +8-bit quantization multiplies outliers in fp16 with non-outliers in int8, converts the non-outlier values back to fp16, and then adds them together to return the weights in fp16. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_68.txt b/chunked/content_aware_chunking/_quantization/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b73006bf88372960fc9c24cd30792a5f226ae45 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_68.txt @@ -0,0 +1 @@ +This reduces the degradative effect outlier values have on a model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_69.txt b/chunked/content_aware_chunking/_quantization/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ae117c3d7ac1cb35a9ee49c6b1f03ed8ab8ea38 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_69.txt @@ -0,0 +1 @@ +4-bit quantization compresses a model even further, and it is commonly used with QLoRA to finetune quantized LLMs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_70.txt b/chunked/content_aware_chunking/_quantization/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceb8d642cdcf0401331b3ce060494a69e30530f3 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_70.txt @@ -0,0 +1,9 @@ +To use bitsandbytes, make sure you have the following libraries installed: + +pip install transformers accelerate bitsandbytes>0.37.0 + +pip install bitsandbytes>=0.39.0 +pip install --upgrade accelerate +pip install --upgrade transformers + +Now you can quantize a model with the load_in_8bit or load_in_4bit parameters in the [~PreTrainedModel.from_pretrained] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_71.txt b/chunked/content_aware_chunking/_quantization/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..21d0df1d33da8c4ce813394d6919482dec72ca8d --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_71.txt @@ -0,0 +1 @@ +This works for any model in any modality, as long as it supports loading with Accelerate and contains torch.nn.Linear layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_72.txt b/chunked/content_aware_chunking/_quantization/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..1795f1b28367fa43564bb5f4a352a3b30c72d2e9 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_72.txt @@ -0,0 +1,6 @@ +Quantizing a model in 8-bit halves the memory-usage, and for large models, set device_map="auto" to efficiently use the GPUs available: + +from transformers import AutoModelForCausalLM +model_8bit = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", device_map="auto", load_in_8bit=True) + +By default, all the other modules such as torch.nn.LayerNorm are converted to torch.float16. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_73.txt b/chunked/content_aware_chunking/_quantization/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..91e90cbf478e37a77ba9537994ee4c18b0c838b0 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_73.txt @@ -0,0 +1,8 @@ +You can change the data type of these modules with the torch_dtype parameter if you want: + +import torch +from transformers import AutoModelForCausalLM +model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True, torch_dtype=torch.float32) +model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype + +Once a model is quantized to 8-bit, you can't push the quantized weights to the Hub unless you're using the latest version of Transformers and bitsandbytes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_74.txt b/chunked/content_aware_chunking/_quantization/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..4802c0565c3075e89354ae253295c1009b222bfc --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_74.txt @@ -0,0 +1 @@ +If you have the latest versions, then you can push the 8-bit model to the Hub with the [~PreTrainedModel.push_to_hub] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_75.txt b/chunked/content_aware_chunking/_quantization/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..26090942f3ee68266d10888106cfd8e019d259e5 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_75.txt @@ -0,0 +1 @@ +The quantization config.json file is pushed first, followed by the quantized model weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_76.txt b/chunked/content_aware_chunking/_quantization/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..668c19937f0d7fce9e46e486d5065bbe5dd04bb2 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_76.txt @@ -0,0 +1,11 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer +model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True) +tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m") +model.push_to_hub("bloom-560m-8bit") + +Quantizing a model in 4-bit reduces your memory-usage by 4x, and for large models, set device_map="auto" to efficiently use the GPUs available: + +from transformers import AutoModelForCausalLM +model_4bit = AutoModelForCausalLM.from_pretrained("bigscience/bloom-1b7", device_map="auto", load_in_4bit=True) + +By default, all the other modules such as torch.nn.LayerNorm are converted to torch.float16. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_77.txt b/chunked/content_aware_chunking/_quantization/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..066713d7d182ffdeb51d3bf12531217129d5ccd5 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_77.txt @@ -0,0 +1,8 @@ +You can change the data type of these modules with the torch_dtype parameter if you want: + +import torch +from transformers import AutoModelForCausalLM +model_4bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_4bit=True, torch_dtype=torch.float32) +model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype + +If you have bitsandbytes>=0.41.3, you can serialize 4-bit models and push them on Hugging Face Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_78.txt b/chunked/content_aware_chunking/_quantization/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..95f0387e57285ba2fa38144d39061ae23e0e0ef9 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_78.txt @@ -0,0 +1 @@ +Simply call model.push_to_hub() after loading it in 4-bit precision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_79.txt b/chunked/content_aware_chunking/_quantization/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..361fd1cbd108f4cafd2d166cac31795b5fd84c4c --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_79.txt @@ -0,0 +1 @@ +You can also save the serialized 4-bit models locally with model.save_pretrained() command. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_80.txt b/chunked/content_aware_chunking/_quantization/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..98cd806e368aefbc096b6dea046e5cb4131bf703 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_80.txt @@ -0,0 +1 @@ +Training with 8-bit and 4-bit weights are only supported for training extra parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_81.txt b/chunked/content_aware_chunking/_quantization/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a062831db5d1b378d924e5229210215c4c554e2 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_81.txt @@ -0,0 +1,11 @@ +You can check your memory footprint with the get_memory_footprint method: +py +print(model.get_memory_footprint()) +Quantized models can be loaded from the [~PreTrainedModel.from_pretrained] method without needing to specify the load_in_8bit or load_in_4bit parameters: + +from transformers import AutoModelForCausalLM, AutoTokenizer +model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto") + +8-bit + +Learn more about the details of 8-bit quantization in this blog post! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_82.txt b/chunked/content_aware_chunking/_quantization/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd48c49b8fdbb79d19ca2d3dd0d88ac2cf09a9bf --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_82.txt @@ -0,0 +1 @@ +This section explores some of the specific features of 8-bit models, such as offloading, outlier thresholds, skipping module conversion, and finetuning. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_83.txt b/chunked/content_aware_chunking/_quantization/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffd8acec2ae739f719399a0c61be14fb1ab482e2 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_83.txt @@ -0,0 +1,2 @@ +Offloading +8-bit models can offload weights between the CPU and GPU to support fitting very large models into memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_84.txt b/chunked/content_aware_chunking/_quantization/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..83c909f177898cccf754c1c58d2661ec3ddda032 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_84.txt @@ -0,0 +1 @@ +The weights dispatched to the CPU are actually stored in float32, and aren't converted to 8-bit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_85.txt b/chunked/content_aware_chunking/_quantization/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..8785c2e6d3c74328c6bf163c3e58f4bf36faa137 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_85.txt @@ -0,0 +1,23 @@ +For example, to enable offloading for the bigscience/bloom-1b7 model, start by creating a [BitsAndBytesConfig]: + +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True) + +Design a custom device map to fit everything on your GPU except for the lm_head, which you'll dispatch to the CPU: +py +device_map = { + "transformer.word_embeddings": 0, + "transformer.word_embeddings_layernorm": 0, + "lm_head": "cpu", + "transformer.h": 0, + "transformer.ln_f": 0, +} +Now load your model with the custom device_map and quantization_config: +py +model_8bit = AutoModelForCausalLM.from_pretrained( + "bigscience/bloom-1b7", + device_map=device_map, + quantization_config=quantization_config, +) +Outlier threshold +An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_86.txt b/chunked/content_aware_chunking/_quantization/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e4f4652877102904bb93b42afceb3f544d9e83b --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_86.txt @@ -0,0 +1 @@ +While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_87.txt b/chunked/content_aware_chunking/_quantization/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..0371f545c58c52ab71652e0bff5acca22825e863 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_87.txt @@ -0,0 +1 @@ +8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_88.txt b/chunked/content_aware_chunking/_quantization/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ebadcdeb04e31c0bf6aa7376f4088e2827cf2f --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_88.txt @@ -0,0 +1 @@ +A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_89.txt b/chunked/content_aware_chunking/_quantization/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb7ae5fccee01c4e8e90fe189acebe6473afbac7 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_89.txt @@ -0,0 +1,15 @@ +To find the best threshold for your model, we recommend experimenting with the llm_int8_threshold parameter in [BitsAndBytesConfig]: + +from transformers import AutoModelForCausalLM, BitsAndBytesConfig +model_id = "bigscience/bloom-1b7" +quantization_config = BitsAndBytesConfig( + llm_int8_threshold=10, +) +model_8bit = AutoModelForCausalLM.from_pretrained( + model_id, + device_map=device_map, + quantization_config=quantization_config, +) + +Skip module conversion +For some models, like Jukebox, you don't need to quantize every module to 8-bit which can actually cause instability. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_90.txt b/chunked/content_aware_chunking/_quantization/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e23c65c011442545a9806291ff397b75acb3bd4 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_90.txt @@ -0,0 +1,15 @@ +With Jukebox, there are several lm_head modules that should be skipped using the llm_int8_skip_modules parameter in [BitsAndBytesConfig]: + +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +model_id = "bigscience/bloom-1b7" +quantization_config = BitsAndBytesConfig( + llm_int8_skip_modules=["lm_head"], +) +model_8bit = AutoModelForCausalLM.from_pretrained( + model_id, + device_map="auto", + quantization_config=quantization_config, +) + +Finetuning +With the PEFT library, you can finetune large models like flan-t5-large and facebook/opt-6.7b with 8-bit quantization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_91.txt b/chunked/content_aware_chunking/_quantization/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..b07a906eab05793272c53d247687ca96b8cb5432 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_91.txt @@ -0,0 +1 @@ +You don't need to pass the device_map parameter for training because it'll automatically load your model on a GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_92.txt b/chunked/content_aware_chunking/_quantization/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf94dc34543069dddf33cfc01b08253453cdc12e --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_92.txt @@ -0,0 +1 @@ +However, you can still customize the device map with the device_map parameter if you want to (device_map="auto" should only be used for inference). 
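A minimal sketch of how such a finetuning setup is commonly wired together with PEFT; the LoRA hyperparameters and target modules below are illustrative assumptions rather than values prescribed by this guide:

from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-6.7b",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
model = prepare_model_for_kbit_training(model)  # freezes the base model and readies it for k-bit training

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],  # assumed attention projections for OPT
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)      # only the LoRA adapter weights are trainable

The resulting model can then be passed to [Trainer] as usual; the 8-bit base weights stay frozen while the adapters are updated.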
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_93.txt b/chunked/content_aware_chunking/_quantization/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..09644a3f9398fbe07554038f46c33e9581875a9c --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_93.txt @@ -0,0 +1,3 @@ +4-bit + +Try 4-bit quantization in this notebook and learn more about it's details in this blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_94.txt b/chunked/content_aware_chunking/_quantization/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..e66f528089b3bcaf426a72f2fd55d6a6bcff08c0 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_94.txt @@ -0,0 +1 @@ +This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_95.txt b/chunked/content_aware_chunking/_quantization/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c8065aff408252f404fb2c79b4858be63781952 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_95.txt @@ -0,0 +1,9 @@ +Compute data type +To speedup computation, you can change the data type from float32 (the default value) to bf16 using the bnb_4bit_compute_dtype parameter in [BitsAndBytesConfig]: + +import torch +from transformers import BitsAndBytesConfig +quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16) + +Normal Float 4 (NF4) +NF4 is a 4-bit data type from the QLoRA paper, adapted for weights initialized from a normal distribution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_96.txt b/chunked/content_aware_chunking/_quantization/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..00a3b7fac9ee686feead3c7d7670dd325a2f3ae3 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_96.txt @@ -0,0 +1 @@ +You should use NF4 for training 4-bit base models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_97.txt b/chunked/content_aware_chunking/_quantization/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..51bb07fa7941ad0308b5675fa2fb148eca0d7f44 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_97.txt @@ -0,0 +1,10 @@ +This can be configured with the bnb_4bit_quant_type parameter in the [BitsAndBytesConfig]: + +from transformers import BitsAndBytesConfig +nf4_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", +) +model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config) + +For inference, the bnb_4bit_quant_type does not have a huge impact on performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_98.txt b/chunked/content_aware_chunking/_quantization/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..45c04cad11bf8b21a665225d750ca1e6562d12c1 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_98.txt @@ -0,0 +1 @@ +However, to remain consistent with the model weights, you should use the bnb_4bit_compute_dtype and torch_dtype values. 
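Putting those pieces together, one way to read that recommendation is to keep bnb_4bit_compute_dtype and torch_dtype aligned; this sketch simply reuses the checkpoint from the earlier examples:

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

model_id = "bigscience/bloom-1b7"
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
model_nf4 = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=nf4_config,
    torch_dtype=torch.bfloat16,  # keep non-quantized modules consistent with the compute dtype
)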
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quantization/chunk_99.txt b/chunked/content_aware_chunking/_quantization/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb91fc15aa38c66e11df7c92dbba09cfc36325e2 --- /dev/null +++ b/chunked/content_aware_chunking/_quantization/chunk_99.txt @@ -0,0 +1,2 @@ +Nested quantization +Nested quantization is a technique that can save additional memory at no additional performance cost. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_32.txt b/chunked/content_aware_chunking/_quicktour/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..f97bc97e0ba42c8635711ab76b4382e4f12ac25a --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_32.txt @@ -0,0 +1 @@ +", "We hope you don't hate it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_33.txt b/chunked/content_aware_chunking/_quicktour/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..d95a37249f49bd9c9de82e399e5627394c9cf6fa --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_33.txt @@ -0,0 +1,9 @@ +"], + padding=True, + truncation=True, + max_length=512, + return_tensors="pt", + ) + +tf_batch = tokenizer( + ["We are very happy to show you the 🤗 Transformers library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_34.txt b/chunked/content_aware_chunking/_quicktour/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..f97bc97e0ba42c8635711ab76b4382e4f12ac25a --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_34.txt @@ -0,0 +1 @@ +", "We hope you don't hate it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_35.txt b/chunked/content_aware_chunking/_quicktour/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c85230b6963ac77d542d8d3b4d7e5a2cc7857d94 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_35.txt @@ -0,0 +1,8 @@ +"], + padding=True, + truncation=True, + max_length=512, + return_tensors="tf", + ) + +Check out the preprocess tutorial for more details about tokenization, and how to use an [AutoImageProcessor], [AutoFeatureExtractor] and [AutoProcessor] to preprocess image, audio, and multimodal inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_36.txt b/chunked/content_aware_chunking/_quicktour/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b249d6e7595c1f40241c7a227ec4b39479721f6 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_36.txt @@ -0,0 +1,3 @@ +AutoModel + +🤗 Transformers provides a simple and unified way to load pretrained instances. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_37.txt b/chunked/content_aware_chunking/_quicktour/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..acca07c19faf931d48fa761073df274a8e70908e --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_37.txt @@ -0,0 +1 @@ +This means you can load an [AutoModel] like you would load an [AutoTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_38.txt b/chunked/content_aware_chunking/_quicktour/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d2684e98d926c6262360114960ad2d68188ded8 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_38.txt @@ -0,0 +1 @@ +The only difference is selecting the correct [AutoModel] for the task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_39.txt b/chunked/content_aware_chunking/_quicktour/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..07ac71fa453a8fc69759df42da752196f4004e16 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_39.txt @@ -0,0 +1,7 @@ +For text (or sequence) classification, you should load [AutoModelForSequenceClassification]: + +from transformers import AutoModelForSequenceClassification +model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +pt_model = AutoModelForSequenceClassification.from_pretrained(model_name) + +See the task summary for tasks supported by an [AutoModel] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_40.txt b/chunked/content_aware_chunking/_quicktour/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..b22784b89f895aceeaa5713f7382b84dd76e3975 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_40.txt @@ -0,0 +1 @@ +Now pass your preprocessed batch of inputs directly to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_41.txt b/chunked/content_aware_chunking/_quicktour/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7f5e6544989a044d59fb24eb5de0564f0be30da --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_41.txt @@ -0,0 +1,5 @@ +You just have to unpack the dictionary by adding **: + +pt_outputs = pt_model(**pt_batch) + +The model outputs the final activations in the logits attribute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_42.txt b/chunked/content_aware_chunking/_quicktour/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f20c1ca5d484cec3c35642d3ecadf0ade308160 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_42.txt @@ -0,0 +1,11 @@ +Apply the softmax function to the logits to retrieve the probabilities: + +from torch import nn +pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1) +print(pt_predictions) +tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725], + [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=) +`` + + +🤗 Transformers provides a simple and unified way to load pretrained instances. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_43.txt b/chunked/content_aware_chunking/_quicktour/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..197f931c38a636f2ca417e13a8bf1eb83888a52f --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_43.txt @@ -0,0 +1 @@ +This means you can load an [TFAutoModel] like you would load an [AutoTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_44.txt b/chunked/content_aware_chunking/_quicktour/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2dfa503e1810c0b9da7fb0202c45ac1bab38409 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_44.txt @@ -0,0 +1 @@ +The only difference is selecting the correct [TFAutoModel] for the task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_45.txt b/chunked/content_aware_chunking/_quicktour/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f2a0a572f1f2e04f29a3d455de2ea830ba781f4 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_45.txt @@ -0,0 +1,7 @@ +For text (or sequence) classification, you should load [TFAutoModelForSequenceClassification`]: + +from transformers import TFAutoModelForSequenceClassification +model_name = "nlptown/bert-base-multilingual-uncased-sentiment" +tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name) + +See the task summary for tasks supported by an [AutoModel] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_46.txt b/chunked/content_aware_chunking/_quicktour/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b22784b89f895aceeaa5713f7382b84dd76e3975 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_46.txt @@ -0,0 +1 @@ +Now pass your preprocessed batch of inputs directly to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_47.txt b/chunked/content_aware_chunking/_quicktour/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8b42a989e424c1ec0d237366ba7843ea0bc6cfd --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_47.txt @@ -0,0 +1,5 @@ +You can pass the tensors as-is: + +tf_outputs = tf_model(tf_batch) + +The model outputs the final activations in the logits attribute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_48.txt b/chunked/content_aware_chunking/_quicktour/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..05afb0dc414b83fbdfdbd9c02423a2482c8eaadd --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_48.txt @@ -0,0 +1,8 @@ +Apply the softmax function to the logits to retrieve the probabilities: + +import tensorflow as tf +tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1) +tf_predictions # doctest: +IGNORE_RESULT + +All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors before the final activation +function (like softmax) because the final activation function is often fused with the loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_49.txt b/chunked/content_aware_chunking/_quicktour/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde28776125ab0b4538a0da9855c000e4b6234ef --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_49.txt @@ -0,0 +1 @@ +Model outputs are special dataclasses so their attributes are autocompleted in an IDE. 
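For instance, continuing the PyTorch example above (this small check assumes pt_outputs from that example is still in scope), the same logits can be reached through attribute, index, or key access:

import torch

logits = pt_outputs.logits                               # attribute access, autocompleted in an IDE
assert torch.equal(pt_outputs[0], pt_outputs["logits"])  # tuple-style and dict-style access return the same tensor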
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_50.txt b/chunked/content_aware_chunking/_quicktour/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2022768cc952b5f9a7a2083f8563b2c1634fd0e7 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_50.txt @@ -0,0 +1 @@ +The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string) in which case, attributes that are None are ignored. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_51.txt b/chunked/content_aware_chunking/_quicktour/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..49d2db5fd404bf2529d14fbd8495c95626c08810 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_51.txt @@ -0,0 +1,25 @@ +Save a model + +Once your model is fine-tuned, you can save it with its tokenizer using [PreTrainedModel.save_pretrained]: + +pt_save_directory = "./pt_save_pretrained" +tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT +pt_model.save_pretrained(pt_save_directory) + +When you are ready to use the model again, reload it with [PreTrainedModel.from_pretrained]: + +pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained") +`` + + +Once your model is fine-tuned, you can save it with its tokenizer using [TFPreTrainedModel.save_pretrained`]: + +tf_save_directory = "./tf_save_pretrained" +tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT +tf_model.save_pretrained(tf_save_directory) + +When you are ready to use the model again, reload it with [TFPreTrainedModel.from_pretrained]: + +tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained") + +One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_52.txt b/chunked/content_aware_chunking/_quicktour/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..2995516b5526992ff02776dcfa40ccfa0df349e1 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_52.txt @@ -0,0 +1,12 @@ +The from_pt or from_tf parameter can convert the model from one framework to the other: + +from transformers import AutoModel +tokenizer = AutoTokenizer.from_pretrained(tf_save_directory) +pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True) + +from transformers import TFAutoModel +tokenizer = AutoTokenizer.from_pretrained(pt_save_directory) +tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True) + +Custom model builds +You can modify the model's configuration class to change how a model is built. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_53.txt b/chunked/content_aware_chunking/_quicktour/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b25f23c189df22ebd51d83b42b9b1d06b4616fb --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_53.txt @@ -0,0 +1 @@ +The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_54.txt b/chunked/content_aware_chunking/_quicktour/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb45515bd1e16940570d72893c99b16515b0984d --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_54.txt @@ -0,0 +1 @@ +You start from scratch when you initialize a model from a custom configuration class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_55.txt b/chunked/content_aware_chunking/_quicktour/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c14f0f3e23a425299b65ea695e98672fdfa185ba --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_55.txt @@ -0,0 +1 @@ +The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_56.txt b/chunked/content_aware_chunking/_quicktour/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..9125f75b5edcc992707eab0ef2d8822be0d83060 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_56.txt @@ -0,0 +1 @@ +Start by importing [AutoConfig], and then load the pretrained model you want to modify. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_57.txt b/chunked/content_aware_chunking/_quicktour/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bf8d0b3772f306d317674ad49fb8b2cb8065a77 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_57.txt @@ -0,0 +1,18 @@ +Within [AutoConfig.from_pretrained], you can specify the attribute you want to change, such as the number of attention heads: + +from transformers import AutoConfig +my_config = AutoConfig.from_pretrained("distilbert/distilbert-base-uncased", n_heads=12) + +Create a model from your custom configuration with [AutoModel.from_config]: + +from transformers import AutoModel +my_model = AutoModel.from_config(my_config) +`` + + +Create a model from your custom configuration with [TFAutoModel.from_config`]: + +from transformers import TFAutoModel +my_model = TFAutoModel.from_config(my_config) + +Take a look at the Create a custom architecture guide for more information about building custom configurations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_58.txt b/chunked/content_aware_chunking/_quicktour/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b4ac43d1926f5701c178814eb11264d722d2050 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_58.txt @@ -0,0 +1,2 @@ +Trainer - a PyTorch optimized training loop +All models are a standard torch.nn.Module so you can use them in any typical training loop. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_59.txt b/chunked/content_aware_chunking/_quicktour/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..036eb867be540733c6774b39ab25477d5827266b --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_59.txt @@ -0,0 +1 @@ +While you can write your own training loop, 🤗 Transformers provides a [Trainer] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_60.txt b/chunked/content_aware_chunking/_quicktour/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b7f158e61caf038e367f1c83000770db75a8726 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_60.txt @@ -0,0 +1,9 @@ +Depending on your task, you'll typically pass the following parameters to [Trainer]: + +You'll start with a [PreTrainedModel] or a torch.nn.Module: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + + +[TrainingArguments] contains the model hyperparameters you can change like learning rate, batch size, and the number of epochs to train for. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_61.txt b/chunked/content_aware_chunking/_quicktour/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7f58a2cd1d3db4cff1ee11b8dc919daeba1fbd5 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_61.txt @@ -0,0 +1,58 @@ +The default values are used if you don't specify any training arguments: + +from transformers import TrainingArguments +training_args = TrainingArguments( + output_dir="path/to/save/folder/", + learning_rate=2e-5, + per_device_train_batch_size=8, + per_device_eval_batch_size=8, + num_train_epochs=2, + ) + + +Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + + +Load a dataset: + +from datasets import load_dataset +dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT + + +Create a function to tokenize the dataset: + +def tokenize_dataset(dataset): + return tokenizer(dataset["text"]) + + +Then apply it over the entire dataset with [~datasets.Dataset.map]: + +dataset = dataset.map(tokenize_dataset, batched=True) + + +A [DataCollatorWithPadding] to create a batch of examples from your dataset: + +from transformers import DataCollatorWithPadding +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + + +Now gather all these classes in [Trainer]: + +from transformers import Trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + tokenizer=tokenizer, + data_collator=data_collator, + ) # doctest: +SKIP + +When you're ready, call [~Trainer.train] to start training: + +trainer.train() # doctest: +SKIP + +For tasks - like translation or summarization - that use a sequence-to-sequence model, use the [Seq2SeqTrainer] and [Seq2SeqTrainingArguments] classes instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_62.txt b/chunked/content_aware_chunking/_quicktour/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e859205c833e7a8f9f0e776c1ff62452fb77a5b8 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_62.txt @@ -0,0 +1 @@ +You can customize the training loop behavior by subclassing the methods inside [Trainer]. 
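For example, here is a minimal sketch of overriding the compute_loss method to apply a weighted loss; the class weights are made-up values and assume a binary classification head:

import torch
from transformers import Trainer

class WeightedLossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # weight the second class more heavily; the weights here are illustrative
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0], device=logits.device))
        loss = loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

You would then build the trainer with WeightedLossTrainer in place of [Trainer], keeping the rest of the setup unchanged.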
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_63.txt b/chunked/content_aware_chunking/_quicktour/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..459e575cc44cdf564a52c033eb749f639e32d120 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_63.txt @@ -0,0 +1 @@ +This allows you to customize features such as the loss function, optimizer, and scheduler. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_64.txt b/chunked/content_aware_chunking/_quicktour/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e57eeca9a8ef9b32bb283cd784bf0aaecb2c743 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_64.txt @@ -0,0 +1 @@ +Take a look at the [Trainer] reference for which methods can be subclassed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_65.txt b/chunked/content_aware_chunking/_quicktour/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..acda56dd10be73a0d4eac8137a842a60c7d0f96b --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_65.txt @@ -0,0 +1 @@ +The other way to customize the training loop is by using Callbacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_66.txt b/chunked/content_aware_chunking/_quicktour/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..114a5f60318fa0c5c77687e49f369c26f431fa33 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_66.txt @@ -0,0 +1 @@ +You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_67.txt b/chunked/content_aware_chunking/_quicktour/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e1c52886b783c13894f906883ac64518232fde --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_67.txt @@ -0,0 +1 @@ +Callbacks do not modify anything in the training loop itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_68.txt b/chunked/content_aware_chunking/_quicktour/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..f75563f06c1cffd5e450f958c4aaadd47c78c884 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_68.txt @@ -0,0 +1 @@ +To customize something like the loss function, you need to subclass the [Trainer] instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_69.txt b/chunked/content_aware_chunking/_quicktour/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d22e874f1dd9b9c8eee84a6b50aa83c831cb26d --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_69.txt @@ -0,0 +1,2 @@ +Train with TensorFlow +All models are a standard tf.keras.Model so they can be trained in TensorFlow with the Keras API. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_70.txt b/chunked/content_aware_chunking/_quicktour/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..46e031cf63cf61228670cda7e0f0b9c6483b5b3c --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_70.txt @@ -0,0 +1 @@ +🤗 Transformers provides the [~TFPreTrainedModel.prepare_tf_dataset] method to easily load your dataset as a tf.data.Dataset so you can start training right away with Keras' compile and fit methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_71.txt b/chunked/content_aware_chunking/_quicktour/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..beddb3267994bd761018ccf05c69a6fe48fda3c3 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_71.txt @@ -0,0 +1,19 @@ +You'll start with a [TFPreTrainedModel] or a tf.keras.Model: + +from transformers import TFAutoModelForSequenceClassification +model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + + +Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + + +Create a function to tokenize the dataset: + +def tokenize_dataset(dataset): + return tokenizer(dataset["text"]) # doctest: +SKIP + + +Apply the tokenizer over the entire dataset with [~datasets.Dataset.map] and then pass the dataset and tokenizer to [~TFPreTrainedModel.prepare_tf_dataset]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_72.txt b/chunked/content_aware_chunking/_quicktour/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..72dd7f79d37b67dbc67a975ef2830aa26341f42b --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_72.txt @@ -0,0 +1,9 @@ +You can also change the batch size and shuffle the dataset here if you'd like: + +dataset = dataset.map(tokenize_dataset) # doctest: +SKIP +tf_dataset = model.prepare_tf_dataset( + dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer + ) # doctest: +SKIP + + +When you're ready, you can call compile and fit to start training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_73.txt b/chunked/content_aware_chunking/_quicktour/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b70d4d8398324ce062fad5ce2c628c25c9332c3 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_73.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +from tensorflow.keras.optimizers import Adam +model.compile(optimizer=Adam(3e-5)) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_74.txt b/chunked/content_aware_chunking/_quicktour/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee912ee3ab1a7a1079a70bb995356f2b5650abd5 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_74.txt @@ -0,0 +1,4 @@ +model.fit(tf_dataset) # doctest: +SKIP + + +What's next? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_75.txt b/chunked/content_aware_chunking/_quicktour/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e1f7aee3f29b939682cbbd2c392d3e7fffb75f4 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_75.txt @@ -0,0 +1 @@ +Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_quicktour/chunk_76.txt b/chunked/content_aware_chunking/_quicktour/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..6748d4a45fac333e3a41674a052c24f654baaa42 --- /dev/null +++ b/chunked/content_aware_chunking/_quicktour/chunk_76.txt @@ -0,0 +1 @@ +If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_22.txt b/chunked/content_aware_chunking/_run_scripts/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..57312972ea4dded529e66cb96e53161b93668dca --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_22.txt @@ -0,0 +1 @@ +Set the number of GPUs to use with the nproc_per_node argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_23.txt b/chunked/content_aware_chunking/_run_scripts/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..59da674b8147aed5b4e701e49b824c459c7a45b8 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_23.txt @@ -0,0 +1,15 @@ +torchrun \ + --nproc_per_node 8 pytorch/summarization/run_summarization.py \ + --fp16 \ + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +TensorFlow scripts utilize a MirroredStrategy for distributed training, and you don't need to add any additional arguments to the training script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_24.txt b/chunked/content_aware_chunking/_run_scripts/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4103aef262eac1a0cab592f3a12595f9d5c774 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_24.txt @@ -0,0 +1 @@ +The TensorFlow script will use multiple GPUs by default if they are available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_25.txt b/chunked/content_aware_chunking/_run_scripts/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..42eef4959912eec08393652389c21d730dd871bb --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_25.txt @@ -0,0 +1,3 @@ +Run a script on a TPU + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_26.txt b/chunked/content_aware_chunking/_run_scripts/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dcd03c6117fd45566ef4bf6d2fdf77a4e17c374 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_26.txt @@ -0,0 +1 @@ +PyTorch supports TPUs with the XLA deep learning compiler (see here for more details). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_27.txt b/chunked/content_aware_chunking/_run_scripts/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a13d5b46e502665b2222d3b6fc74ba58cde20ec --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_27.txt @@ -0,0 +1 @@ +To use a TPU, launch the xla_spawn.py script and use the num_cores argument to set the number of TPU cores you want to use. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_28.txt b/chunked/content_aware_chunking/_run_scripts/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..314ff34b33735221b727d4468cd67ff75e901770 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_28.txt @@ -0,0 +1,15 @@ +python xla_spawn.py --num_cores 8 \ + summarization/run_summarization.py \ + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate + +Tensor Processing Units (TPUs) are specifically designed to accelerate performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_29.txt b/chunked/content_aware_chunking/_run_scripts/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..d32c698dc52861c71dc1e60e8750207d89c69b32 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_29.txt @@ -0,0 +1 @@ +TensorFlow scripts utilize a TPUStrategy for training on TPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_30.txt b/chunked/content_aware_chunking/_run_scripts/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4685ae32275c461da68ccfcb25cf0489465aeca --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_30.txt @@ -0,0 +1 @@ +To use a TPU, pass the name of the TPU resource to the tpu argument. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_31.txt b/chunked/content_aware_chunking/_run_scripts/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e3593a05fe12040609a948da3a42a22f67199f1 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_31.txt @@ -0,0 +1,14 @@ +python run_summarization.py \ + --tpu name_of_tpu_resource \ + --model_name_or_path google-t5/t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size 8 \ + --per_device_eval_batch_size 16 \ + --num_train_epochs 3 \ + --do_train \ + --do_eval + +Run a script with 🤗 Accelerate +🤗 Accelerate is a PyTorch-only library that offers a unified method for training a model on several types of setups (CPU-only, multiple GPUs, TPUs) while maintaining complete visibility into the PyTorch training loop. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_32.txt b/chunked/content_aware_chunking/_run_scripts/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..47f5daae20fcb7dd32f906d524e720be0a8f66f6 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_32.txt @@ -0,0 +1,7 @@ +Make sure you have 🤗 Accelerate installed if you don't already have it: + +Note: As Accelerate is rapidly developing, the git version of accelerate must be installed to run the scripts + +pip install git+https://github.com/huggingface/accelerate + +Instead of the run_summarization.py script, you need to use the run_summarization_no_trainer.py script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_33.txt b/chunked/content_aware_chunking/_run_scripts/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa8056e9b16fb47a1a611f8537204771791f6e2f --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_33.txt @@ -0,0 +1 @@ +🤗 Accelerate supported scripts will have a task_no_trainer.py file in the folder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_34.txt b/chunked/content_aware_chunking/_run_scripts/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..f99ad016e4f44f6ccdc1861dd38cd1500e6eff0f --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_34.txt @@ -0,0 +1,16 @@ +Begin by running the following command to create and save a configuration file: + +accelerate config +Test your setup to make sure it is configured correctly: + +accelerate test +Now you are ready to launch the training: + +accelerate launch run_summarization_no_trainer.py \ + --model_name_or_path google-t5/t5-small \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir ~/tmp/tst-summarization +Use a custom dataset +The summarization script supports custom datasets as long as they are a CSV or JSON Line file. 
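A minimal sketch of what such a JSON Lines file could contain; the field names are placeholders that you map to the script through the column arguments described next:

import json

examples = [
    {"text": "Long article body goes here ...", "summary": "Short summary goes here."},
]
with open("train.jsonl", "w") as f:
    for example in examples:
        f.write(json.dumps(example) + "\n")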
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_35.txt b/chunked/content_aware_chunking/_run_scripts/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd88fa1f685b21bfa0bfed9f65f109c3490532a --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_35.txt @@ -0,0 +1,3 @@ +When you use your own dataset, you need to specify several additional arguments: + +train_file and validation_file specify the path to your training and validation files. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_36.txt b/chunked/content_aware_chunking/_run_scripts/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..f515042e6e4445e50ddaa5146574f8fe6e2dd184 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_36.txt @@ -0,0 +1 @@ +text_column is the input text to summarize. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_37.txt b/chunked/content_aware_chunking/_run_scripts/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..5df52ebe8b5c75f8b37be7f435f2469134ac059a --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_37.txt @@ -0,0 +1 @@ +summary_column is the target text to output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_38.txt b/chunked/content_aware_chunking/_run_scripts/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..55412e09c0b7a7054e138a7eb61f2410a49d3238 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_38.txt @@ -0,0 +1,18 @@ +A summarization script using a custom dataset would look like this: + +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --train_file path_to_csv_or_jsonlines_file \ + --validation_file path_to_csv_or_jsonlines_file \ + --text_column text_column_name \ + --summary_column summary_column_name \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --overwrite_output_dir \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --predict_with_generate +Test a script +It is often a good idea to run your script on a smaller number of dataset examples to ensure everything works as expected before committing to an entire dataset which may take hours to complete. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_39.txt b/chunked/content_aware_chunking/_run_scripts/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..44fafcee7351a1b75eb71881f7e9379d3971619e --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_39.txt @@ -0,0 +1,22 @@ +Use the following arguments to truncate the dataset to a maximum number of samples: + +max_train_samples +max_eval_samples +max_predict_samples + +python examples/pytorch/summarization/run_summarization.py \ + --model_name_or_path google-t5/t5-small \ + --max_train_samples 50 \ + --max_eval_samples 50 \ + --max_predict_samples 50 \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate +Not all example scripts support the max_predict_samples argument. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_40.txt b/chunked/content_aware_chunking/_run_scripts/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..41618ae6ed92c1da6e9c12d77fad5afd16efb3df --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_40.txt @@ -0,0 +1,5 @@ +If you aren't sure whether your script supports this argument, add the -h argument to check: + +examples/pytorch/summarization/run_summarization.py -h +Resume training from checkpoint +Another helpful option to enable is resuming training from a previous checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_41.txt b/chunked/content_aware_chunking/_run_scripts/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..429bc51507b797bdf756968ef7703c0d0dbadd3c --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_41.txt @@ -0,0 +1 @@ +This will ensure you can pick up where you left off without starting over if your training gets interrupted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_42.txt b/chunked/content_aware_chunking/_run_scripts/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..4399a5cc88b9f7b5dc3f0dcca89975dcd5847e98 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_42.txt @@ -0,0 +1 @@ +There are two methods to resume training from a checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_43.txt b/chunked/content_aware_chunking/_run_scripts/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..e554a3bf7326bf73a35e1b58f1febebdce532f13 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_43.txt @@ -0,0 +1 @@ +The first method uses the output_dir previous_output_dir argument to resume training from the latest checkpoint stored in output_dir. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_44.txt b/chunked/content_aware_chunking/_run_scripts/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..c659c30512d44a4797af998602dce350286e122b --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_44.txt @@ -0,0 +1,15 @@ +In this case, you should remove overwrite_output_dir: + +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --output_dir previous_output_dir \ + --predict_with_generate +The second method uses the resume_from_checkpoint path_to_specific_checkpoint argument to resume training from a specific checkpoint folder. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_45.txt b/chunked/content_aware_chunking/_run_scripts/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b65f26409d7c9ca18d1a58b4cb78609dc47ecfd --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_45.txt @@ -0,0 +1,15 @@ +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --resume_from_checkpoint path_to_specific_checkpoint \ + --predict_with_generate +Share your model +All scripts can upload your final model to the Model Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_46.txt b/chunked/content_aware_chunking/_run_scripts/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcab35296c5bb6b8e827ebac39be678adc56d23c --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_46.txt @@ -0,0 +1,4 @@ +Make sure you are logged into Hugging Face before you begin: + +huggingface-cli login +Then add the push_to_hub argument to the script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_47.txt b/chunked/content_aware_chunking/_run_scripts/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..7405996f0d2108175fdeeda0e6386b5e72ac57c5 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_47.txt @@ -0,0 +1 @@ +This argument will create a repository with your Hugging Face username and the folder name specified in output_dir. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_48.txt b/chunked/content_aware_chunking/_run_scripts/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5d0d796890fc5b2f2d8b56514c7a32cfef7e640 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_48.txt @@ -0,0 +1 @@ +To give your repository a specific name, use the push_to_hub_model_id argument to add it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_49.txt b/chunked/content_aware_chunking/_run_scripts/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..d27d97960f0204905c4224ee536993cd3965cb8a --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_49.txt @@ -0,0 +1 @@ +The repository will be automatically listed under your namespace. 
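For comparison, a trained model and tokenizer can also be uploaded programmatically after the script finishes; a minimal sketch in which the local checkpoint directory and repository name are illustrative (you must already be logged in as shown above):

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load the fine-tuned checkpoint saved by the script (path is illustrative).
model = AutoModelForSeq2SeqLM.from_pretrained("/tmp/tst-summarization")
tokenizer = AutoTokenizer.from_pretrained("/tmp/tst-summarization")

# Push both to a repository under your namespace (repository id is illustrative).
model.push_to_hub("finetuned-t5-cnn_dailymail")
tokenizer.push_to_hub("finetuned-t5-cnn_dailymail")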
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_run_scripts/chunk_50.txt b/chunked/content_aware_chunking/_run_scripts/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..fea95fa54f4e3add8ffbb7ca3785791e15f40538 --- /dev/null +++ b/chunked/content_aware_chunking/_run_scripts/chunk_50.txt @@ -0,0 +1,16 @@ +The following example shows how to upload a model with a specific repository name: + +python examples/pytorch/summarization/run_summarization.py + --model_name_or_path google-t5/t5-small \ + --do_train \ + --do_eval \ + --dataset_name cnn_dailymail \ + --dataset_config "3.0.0" \ + --source_prefix "summarize: " \ + --push_to_hub \ + --push_to_hub_model_id finetuned-t5-cnn_dailymail \ + --output_dir /tmp/tst-summarization \ + --per_device_train_batch_size=4 \ + --per_device_eval_batch_size=4 \ + --overwrite_output_dir \ + --predict_with_generate \ No newline at end of file diff --git a/chunked/content_aware_chunking/_sagemaker/chunk_1.txt b/chunked/content_aware_chunking/_sagemaker/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..447333f31d0862f1f0c8f2909e4c27721936906a --- /dev/null +++ b/chunked/content_aware_chunking/_sagemaker/chunk_1.txt @@ -0,0 +1 @@ +This page will be removed in transformers 5.0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_sagemaker/chunk_2.txt b/chunked/content_aware_chunking/_sagemaker/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa43639f402b0e7773c929822423e8ed6e072fc5 --- /dev/null +++ b/chunked/content_aware_chunking/_sagemaker/chunk_2.txt @@ -0,0 +1,4 @@ +Table of Content + +Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK +Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_15.txt b/chunked/content_aware_chunking/_serialization/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7532dedc2a68eed23bf6c944b3c2c13a45fc23db --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_15.txt @@ -0,0 +1 @@ +export with 🤗 Optimum with optimum.onnxruntime. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_16.txt b/chunked/content_aware_chunking/_serialization/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..907f53506603f81b8e5069c6231db0baf74a2325 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_16.txt @@ -0,0 +1,23 @@ +Exporting a 🤗 Transformers model to ONNX with CLI +To export a 🤗 Transformers model to ONNX, first install an extra dependency: + +pip install optimum[exporters] +To check out all available arguments, refer to the 🤗 Optimum docs, +or view help in command line: + +optimum-cli export onnx --help +To export a model's checkpoint from the 🤗 Hub, for example, distilbert/distilbert-base-uncased-distilled-squad, run the following command: + +optimum-cli export onnx --model distilbert/distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/ +You should see the logs indicating progress and showing where the resulting model.onnx is saved, like this: + +Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx + -[✓] ONNX model output names match reference model (start_logits, end_logits) + - Validating ONNX Model output "start_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) + - Validating ONNX Model output "end_logits": + -[✓] (2, 16) matches (2, 16) + -[✓] all values close (atol: 0.0001) +The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx +The example above illustrates exporting a checkpoint from 🤗 Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_17.txt b/chunked/content_aware_chunking/_serialization/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ce17a0bc63785d9eca593486bc3244e2f4ac392 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_17.txt @@ -0,0 +1,2 @@ +When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (local_path). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_18.txt b/chunked/content_aware_chunking/_serialization/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..82908c671a33883ef2d9cc6a36357a89926503ff --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_18.txt @@ -0,0 +1,2 @@ +When using CLI, pass the +local_path to the model argument instead of the checkpoint name on 🤗 Hub and provide the --task argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_19.txt b/chunked/content_aware_chunking/_serialization/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..33fac156e513cc30040b6e2932870bd8f9d5590b --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_19.txt @@ -0,0 +1 @@ +You can review the list of supported tasks in the 🤗 Optimum documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_20.txt b/chunked/content_aware_chunking/_serialization/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1506410be3b0b3fd0e6edcefdb4b20b50c4e89d --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_20.txt @@ -0,0 +1 @@ +If task argument is not provided, it will default to the model architecture without any task specific head. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_21.txt b/chunked/content_aware_chunking/_serialization/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..000900ea70d7f10ac9bc42ea7934b369748786d7 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_21.txt @@ -0,0 +1,4 @@ +optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/ +The resulting model.onnx file can then be run on one of the many +accelerators that support the ONNX +standard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_22.txt b/chunked/content_aware_chunking/_serialization/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..bee6a759b8b65071bdf0dc764a0b384657bea372 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_22.txt @@ -0,0 +1,9 @@ +For example, we can load and run the model with ONNX +Runtime as follows: +thon + +from transformers import AutoTokenizer +from optimum.onnxruntime import ORTModelForQuestionAnswering +tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx") +model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx") +inputs = tokenizer("What am I using? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_23.txt b/chunked/content_aware_chunking/_serialization/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..165614f6ee6fcf271931b4e5b6059e6670a504db --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_23.txt @@ -0,0 +1 @@ +", "Using DistilBERT with ONNX Runtime! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_24.txt b/chunked/content_aware_chunking/_serialization/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..616bff6b1c1cea1fe133ff78898866175fcad4d1 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_24.txt @@ -0,0 +1,4 @@ +", return_tensors="pt") +outputs = model(**inputs) + +The process is identical for TensorFlow checkpoints on the Hub. 
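As an aside, the ONNX Runtime snippet above stops at the raw model outputs; the sketch below shows one way to decode the answer span from the start/end logits, assuming the export directory created earlier:

import torch
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForQuestionAnswering

# Load the exported ONNX model and its tokenizer from the export step above.
tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")

question = "What am I using?"
context = "Using DistilBERT with ONNX Runtime!"
inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

# Pick the most likely start and end token positions, then decode them back to text.
start = torch.argmax(outputs.start_logits)
end = torch.argmax(outputs.end_logits) + 1
answer = tokenizer.decode(inputs["input_ids"][0][start:end])
print(answer)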
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_25.txt b/chunked/content_aware_chunking/_serialization/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..a616afaefb792a061fddba9798f031f1b67e232d --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_25.txt @@ -0,0 +1,24 @@ +For instance, here's how you would +export a pure TensorFlow checkpoint from the Keras organization: + +optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/ +Exporting a 🤗 Transformers model to ONNX with optimum.onnxruntime +Alternative to CLI, you can export a 🤗 Transformers model to ONNX programmatically like so: +thon + +from optimum.onnxruntime import ORTModelForSequenceClassification +from transformers import AutoTokenizer +model_checkpoint = "distilbert_base_uncased_squad" +save_directory = "onnx/" +Load a model from transformers and export it to ONNX +ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True) +tokenizer = AutoTokenizer.from_pretrained(model_checkpoint) +Save the onnx model and tokenizer +ort_model.save_pretrained(save_directory) +tokenizer.save_pretrained(save_directory) + +Exporting a model for an unsupported architecture +If you wish to contribute by adding support for a model that cannot be currently exported, you should first check if it is +supported in optimum.exporters.onnx, +and if it is not, contribute to 🤗 Optimum +directly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_26.txt b/chunked/content_aware_chunking/_serialization/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..59480f04d22311a269c703a4ccb7ae2cab80ab65 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_26.txt @@ -0,0 +1,3 @@ +Exporting a model with transformers.onnx + +tranformers.onnx is no longer maintained, please export models with 🤗 Optimum as described above. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_27.txt b/chunked/content_aware_chunking/_serialization/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..73c6fbf2338a442fba98db06d5b3d11be4ae0acf --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_27.txt @@ -0,0 +1 @@ +This section will be removed in the future versions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_28.txt b/chunked/content_aware_chunking/_serialization/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f747b4ab7d4dea59bd762b703ef77813b14379f --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_28.txt @@ -0,0 +1,7 @@ +To export a 🤗 Transformers model to ONNX with tranformers.onnx, install extra dependencies: + +pip install transformers[onnx] +Use transformers.onnx package as a Python module to export a checkpoint using a ready-made configuration: + +python -m transformers.onnx --model=distilbert/distilbert-base-uncased onnx/ +This exports an ONNX graph of the checkpoint defined by the --model argument. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_29.txt b/chunked/content_aware_chunking/_serialization/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..f911008a0e87592de615818217106ad07221045b --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_29.txt @@ -0,0 +1 @@ +Pass any checkpoint on the 🤗 Hub or one that's stored locally. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_30.txt b/chunked/content_aware_chunking/_serialization/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cdb05f52578e92c54d808f7c539773371481284 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_30.txt @@ -0,0 +1 @@ +The resulting model.onnx file can then be run on one of the many accelerators that support the ONNX standard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_31.txt b/chunked/content_aware_chunking/_serialization/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..52689b79b971f2fff02371518503e31b28b6c2cd --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_31.txt @@ -0,0 +1,10 @@ +For example, +load and run the model with ONNX Runtime as follows: +thon + +from transformers import AutoTokenizer +from onnxruntime import InferenceSession +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") +session = InferenceSession("onnx/model.onnx") +ONNX Runtime expects NumPy arrays as input +inputs = tokenizer("Using DistilBERT with ONNX Runtime! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_32.txt b/chunked/content_aware_chunking/_serialization/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..21d8fc1588e9312135909fc85c16555d2189f959 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_32.txt @@ -0,0 +1,5 @@ +", return_tensors="np") +outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs)) + +The required output names (like ["last_hidden_state"]) can be obtained by taking a look at the ONNX configuration of +each model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_33.txt b/chunked/content_aware_chunking/_serialization/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f9ea8a43e1e140576679011ea77b68973ce5f00 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_33.txt @@ -0,0 +1,10 @@ +For example, for DistilBERT we have: +thon + +from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig +config = DistilBertConfig() +onnx_config = DistilBertOnnxConfig(config) +print(list(onnx_config.outputs.keys())) +["last_hidden_state"] + +The process is identical for TensorFlow checkpoints on the Hub. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_34.txt b/chunked/content_aware_chunking/_serialization/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..3191768e140a8a1910ff2443af910d124ea6b88a --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_34.txt @@ -0,0 +1,4 @@ +For example, export a pure TensorFlow checkpoint like so: + +python -m transformers.onnx --model=keras-io/transformers-qa onnx/ +To export a model that's stored locally, save the model's weights and tokenizer files in the same directory (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_serialization/chunk_35.txt b/chunked/content_aware_chunking/_serialization/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..0712fd862bb7182cb93561df291e44a42e6a8fb5 --- /dev/null +++ b/chunked/content_aware_chunking/_serialization/chunk_35.txt @@ -0,0 +1,4 @@ +local-pt-checkpoint), +then export it to ONNX by pointing the --model argument of the transformers.onnx package to the desired directory: + +python -m transformers.onnx --model=local-pt-checkpoint onnx/ \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_39.txt b/chunked/content_aware_chunking/_task_summary/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..df65abaf8e5d2b9a40102ff43fe412396d2d9e92 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_39.txt @@ -0,0 +1 @@ +Depth information is also helpful for constructing 3D representations from 2D images and can be used to create high-quality 3D representations of biological structures or buildings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_40.txt b/chunked/content_aware_chunking/_task_summary/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..83b656fee0410a1d6c9fe0eb1a3cf739ffd31220 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_40.txt @@ -0,0 +1,13 @@ +There are two approaches to depth estimation: + +stereo: depths are estimated by comparing two images of the same image from slightly different angles +monocular: depths are estimated from a single image + +from transformers import pipeline +depth_estimator = pipeline(task="depth-estimation") +preds = depth_estimator( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg" + ) + +Natural language processing +NLP tasks are among the most common types of tasks because text is such a natural way for us to communicate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_41.txt b/chunked/content_aware_chunking/_task_summary/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff39b644d276457f9c9108a13fa94ad7345edbee --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_41.txt @@ -0,0 +1 @@ +To get text into a format recognized by a model, it needs to be tokenized. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_42.txt b/chunked/content_aware_chunking/_task_summary/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e0f7e88fef47718f0b0990e4771ac85d8df64e9 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_42.txt @@ -0,0 +1 @@ +This means dividing a sequence of text into separate words or subwords (tokens) and then converting these tokens into numbers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_43.txt b/chunked/content_aware_chunking/_task_summary/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f1cfcb4302972b9d4167b3efc40afcc2265119f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_43.txt @@ -0,0 +1 @@ +As a result, you can represent a sequence of text as a sequence of numbers, and once you have a sequence of numbers, it can be input into a model to solve all sorts of NLP tasks! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_44.txt b/chunked/content_aware_chunking/_task_summary/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..0879ba68edc1f08513a4cd9edc5d411076b20422 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_44.txt @@ -0,0 +1,2 @@ +Text classification +Like classification tasks in any modality, text classification labels a sequence of text (it can be sentence-level, a paragraph, or a document) from a predefined set of classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_45.txt b/chunked/content_aware_chunking/_task_summary/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa59a8db6c162cd5416f622d5deb3332739947f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_45.txt @@ -0,0 +1,4 @@ +There are many practical applications for text classification, some of which include: + +sentiment analysis: label text according to some polarity like positive or negative which can inform and support decision-making in fields like politics, finance, and marketing +content classification: label text according to some topic to help organize and filter information in news and social media feeds (weather, sports, finance, etc.) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_46.txt b/chunked/content_aware_chunking/_task_summary/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..86d30f70a3cc24b482870e14f7066ef4bf551c44 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_46.txt @@ -0,0 +1,3 @@ +from transformers import pipeline +classifier = pipeline(task="sentiment-analysis") +preds = classifier("Hugging Face is the best thing since sliced bread!") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_47.txt b/chunked/content_aware_chunking/_task_summary/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..310dacefc8b804a2e0e72a9f8f9a417b7a0cb648 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_47.txt @@ -0,0 +1,6 @@ +preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds] +preds +[{'score': 0.9991, 'label': 'POSITIVE'}] + +Token classification +In any NLP task, text is preprocessed by separating the sequence of text into individual words or subwords. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_48.txt b/chunked/content_aware_chunking/_task_summary/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4b3e954b6321072ffa67cbc1397b647c271c4e4 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_48.txt @@ -0,0 +1 @@ +These are known as tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_49.txt b/chunked/content_aware_chunking/_task_summary/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..003c99d1cacfe2c7b2207164aacf062ebc95f0e0 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_49.txt @@ -0,0 +1 @@ +Token classification assigns each token a label from a predefined set of classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_50.txt b/chunked/content_aware_chunking/_task_summary/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..464c07129382dbddec3b7ed80ea2feafd1511fb8 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_50.txt @@ -0,0 +1,3 @@ +Two common types of token classification are: + +named entity recognition (NER): label a token according to an entity category like organization, person, location or date. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_51.txt b/chunked/content_aware_chunking/_task_summary/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..abed9156972fca6316f7181b59fb8210a27b8afb --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_51.txt @@ -0,0 +1 @@ +NER is especially popular in biomedical settings, where it can label genes, proteins, and drug names. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_52.txt b/chunked/content_aware_chunking/_task_summary/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..06390410538762b82c114d97cf9c88ef62ac5091 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_52.txt @@ -0,0 +1 @@ +part-of-speech tagging (POS): label a token according to its part-of-speech like noun, verb, or adjective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_53.txt b/chunked/content_aware_chunking/_task_summary/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f2914fa13efcd5dcacb6595175a7dafbe3e4e16 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_53.txt @@ -0,0 +1 @@ +POS is useful for helping translation systems understand how two identical words are grammatically different (bank as a noun versus bank as a verb). 
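Before the token-level pipelines below, here is a minimal sketch of the tokenization step described earlier, splitting text into subword tokens and converting them into numbers; the checkpoint is an illustrative choice:

from transformers import AutoTokenizer

# Any checkpoint works; DistilBERT is just an illustrative choice here.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

text = "Hugging Face is a community-based open-source platform for machine learning."
tokens = tokenizer.tokenize(text)          # sequence of subword tokens
input_ids = tokenizer(text)["input_ids"]   # the numbers a model actually consumes

print(tokens)
print(input_ids)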
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_54.txt b/chunked/content_aware_chunking/_task_summary/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcd84ac8ad3c274290fb8db82f16f4f5bcccfbc0 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_54.txt @@ -0,0 +1,3 @@ +from transformers import pipeline +classifier = pipeline(task="ner") +preds = classifier("Hugging Face is a French company based in New York City.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_55.txt b/chunked/content_aware_chunking/_task_summary/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..2275aef6f70d2bf66742cbb78d0fd0f859a3c8de --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_55.txt @@ -0,0 +1,22 @@ +preds = [ + { + "entity": pred["entity"], + "score": round(pred["score"], 4), + "index": pred["index"], + "word": pred["word"], + "start": pred["start"], + "end": pred["end"], + } + for pred in preds + ] +print(*preds, sep="\n") +{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2} +{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7} +{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12} +{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24} +{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45} +{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50} +{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55} + +Question answering +Question answering is another token-level task that returns an answer to a question, sometimes with context (open-domain) and other times without context (closed-domain). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_56.txt b/chunked/content_aware_chunking/_task_summary/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c9042b73cb1f11e10155174c7a70801f0dc2aec --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_56.txt @@ -0,0 +1 @@ +This task happens whenever we ask a virtual assistant something like whether a restaurant is open. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_57.txt b/chunked/content_aware_chunking/_task_summary/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a54c0c3feea2696b56646adf5537181de5291f0 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_57.txt @@ -0,0 +1 @@ +It can also provide customer or technical support and help search engines retrieve the relevant information you're asking for. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_58.txt b/chunked/content_aware_chunking/_task_summary/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dee28aec897d89ff15691dcde73d8b7bc4acc96 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_58.txt @@ -0,0 +1,9 @@ +There are two common types of question answering: + +extractive: given a question and some context, the answer is a span of text from the context the model must extract +abstractive: given a question and some context, the answer is generated from the context; this approach is handled by the [Text2TextGenerationPipeline] instead of the [QuestionAnsweringPipeline] shown below + +from transformers import pipeline +question_answerer = pipeline(task="question-answering") +preds = question_answerer( + question="What is the name of the repository? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_59.txt b/chunked/content_aware_chunking/_task_summary/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..70d3479cdd1c1ffa6e59947956640b82af9abdee --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_59.txt @@ -0,0 +1,10 @@ +", + context="The name of the repository is huggingface/transformers", + ) +print( + f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}" + ) +score: 0.9327, start: 30, end: 54, answer: huggingface/transformers + +Summarization +Summarization creates a shorter version of a text from a longer one while trying to preserve most of the meaning of the original document. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_60.txt b/chunked/content_aware_chunking/_task_summary/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..187479715d99e50ff673c556e513008e353b8147 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_60.txt @@ -0,0 +1 @@ +Summarization is a sequence-to-sequence task; it outputs a shorter text sequence than the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_61.txt b/chunked/content_aware_chunking/_task_summary/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e8048749e003ac4f1aa41a951cd6cee8b20cd39 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_61.txt @@ -0,0 +1 @@ +There are a lot of long-form documents that can be summarized to help readers quickly understand the main points. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_62.txt b/chunked/content_aware_chunking/_task_summary/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..67eb17c90a933adcded4be9d4eb9682c961f896f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_62.txt @@ -0,0 +1 @@ +Legislative bills, legal and financial documents, patents, and scientific papers are a few examples of documents that could be summarized to save readers time and serve as a reading aid. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_63.txt b/chunked/content_aware_chunking/_task_summary/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..656e732437f869db6175ad189df6172fa8519487 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_63.txt @@ -0,0 +1,9 @@ +Like question answering, there are two types of summarization: + +extractive: identify and extract the most important sentences from the original text +abstractive: generate the target summary (which may include new words not in the input document) from the original text; the [SummarizationPipeline] uses the abstractive approach + +from transformers import pipeline +summarizer = pipeline(task="summarization") +summarizer( + "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_64.txt b/chunked/content_aware_chunking/_task_summary/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaa7877000c5054982937e2e62c4c56884c8ba5f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_64.txt @@ -0,0 +1 @@ +For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_65.txt b/chunked/content_aware_chunking/_task_summary/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34bc3f31a9cfddc6bec6275a9829e7e245c595b --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_65.txt @@ -0,0 +1 @@ +On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_66.txt b/chunked/content_aware_chunking/_task_summary/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c24e76645414af27c812cd032a7c33d099262cf --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_66.txt @@ -0,0 +1 @@ +In the former task our best model outperforms even all previously reported ensembles." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_67.txt b/chunked/content_aware_chunking/_task_summary/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4bfddf4d5fda90c6a11b5386c264deb1c8f2e2f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_67.txt @@ -0,0 +1,2 @@ +) +[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_68.txt b/chunked/content_aware_chunking/_task_summary/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..d09f0b9014cba591ea3057aa71dc578528971fe7 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_68.txt @@ -0,0 +1 @@ +It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_69.txt b/chunked/content_aware_chunking/_task_summary/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..172930daef1768b586e48c75943b61e8fae40dda --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_69.txt @@ -0,0 +1 @@ +For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_70.txt b/chunked/content_aware_chunking/_task_summary/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbe9a84e4b4001cc7ee2bde9310eeac2cb0f298e --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_70.txt @@ -0,0 +1,2 @@ +Translation +Translation converts a sequence of text in one language to another. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_71.txt b/chunked/content_aware_chunking/_task_summary/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..a48d0e9d2c90dca41f31b4926cde2db37bc43df3 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_71.txt @@ -0,0 +1 @@ +It is important in helping people from different backgrounds communicate with each other, help translate content to reach wider audiences, and even be a learning tool to help people learn a new language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_72.txt b/chunked/content_aware_chunking/_task_summary/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..515a8f059fa7bf49f67e71586265bc17c9eca6b8 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_72.txt @@ -0,0 +1 @@ +Along with summarization, translation is a sequence-to-sequence task, meaning the model receives an input sequence and returns a target output sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_73.txt b/chunked/content_aware_chunking/_task_summary/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7b054a8cd7e5b6e88971732829910641a2ec485 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_73.txt @@ -0,0 +1 @@ +In the early days, translation models were mostly monolingual, but recently, there has been increasing interest in multilingual models that can translate between many pairs of languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_74.txt b/chunked/content_aware_chunking/_task_summary/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f0d7c34db551cbac289ff83b07aa0c75bbf8e72 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_74.txt @@ -0,0 +1,2 @@ +from transformers import pipeline +text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_75.txt b/chunked/content_aware_chunking/_task_summary/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..c24664f4666b98d69052491abf5891f0028ab781 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_75.txt @@ -0,0 +1,3 @@ +translator = pipeline(task="translation", model="google-t5/t5-small") +translator(text) +[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_76.txt b/chunked/content_aware_chunking/_task_summary/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcf967ba6c4b64b92f2cad48fa3b831464aee994 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_76.txt @@ -0,0 +1,2 @@ +Language modeling +Language modeling is a task that predicts a word in a sequence of text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_77.txt b/chunked/content_aware_chunking/_task_summary/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dafce80310d4f3a06a3fbf783da7cf2161718f --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_77.txt @@ -0,0 +1 @@ +It has become a very popular NLP task because a pretrained language model can be finetuned for many other downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_78.txt b/chunked/content_aware_chunking/_task_summary/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..57458b590e1f34d861cd983c92adb8d349a932ab --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_78.txt @@ -0,0 +1 @@ +Lately, there has been a lot of interest in large language models (LLMs) which demonstrate zero- or few-shot learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_79.txt b/chunked/content_aware_chunking/_task_summary/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c75ce3cbfb559af0e60cb8e3f816a633f7e8ad3c --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_79.txt @@ -0,0 +1 @@ +This means the model can solve tasks it wasn't explicitly trained to do! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_80.txt b/chunked/content_aware_chunking/_task_summary/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef0324cdfce75bf8fe7d515022ad3b47d2e98fcc --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_80.txt @@ -0,0 +1 @@ +Language models can be used to generate fluent and convincing text, though you need to be careful since the text may not always be accurate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_81.txt b/chunked/content_aware_chunking/_task_summary/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..91e17630dd53876c144fe0b911b574f1d520a4ef --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_81.txt @@ -0,0 +1,6 @@ +There are two types of language modeling: + +causal: the model's objective is to predict the next token in a sequence, and future tokens are masked + +from transformers import pipeline +prompt = "Hugging Face is a community-based open-source platform for machine learning." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_82.txt b/chunked/content_aware_chunking/_task_summary/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..008a7f76a8d4ac4fd7478a694dceacc737f54c30 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_82.txt @@ -0,0 +1,6 @@ +generator = pipeline(task="text-generation") +generator(prompt) # doctest: +SKIP + +masked: the model's objective is to predict a masked token in a sequence with full access to the tokens in the sequence + +text = "Hugging Face is a community-based open-source for machine learning." \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_83.txt b/chunked/content_aware_chunking/_task_summary/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..2be8b375ffe4bbc6fff57989b17d4c08650eb338 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_83.txt @@ -0,0 +1,16 @@ +fill_mask = pipeline(task="fill-mask") +preds = fill_mask(text, top_k=1) +preds = [ + { + "score": round(pred["score"], 4), + "token": pred["token"], + "token_str": pred["token_str"], + "sequence": pred["sequence"], + } + for pred in preds + ] +preds +[{'score': 0.2236, + 'token': 1761, + 'token_str': ' platform', + 'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_84.txt b/chunked/content_aware_chunking/_task_summary/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ea6f52d7efa4bb77ac6d971789a5bf9648a72c8 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_84.txt @@ -0,0 +1,2 @@ +Multimodal +Multimodal tasks require a model to process multiple data modalities (text, image, audio, video) to solve a particular problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_85.txt b/chunked/content_aware_chunking/_task_summary/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..6aed654a391cf5c4d542fce22627800d7dfb2d01 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_85.txt @@ -0,0 +1 @@ +Image captioning is an example of a multimodal task where the model takes an image as input and outputs a sequence of text describing the image or some properties of the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_86.txt b/chunked/content_aware_chunking/_task_summary/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..b64ecef93059165951c3381068d761d42c56af50 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_86.txt @@ -0,0 +1 @@ +Although multimodal models work with different data types or modalities, internally, the preprocessing steps help the model convert all the data types into embeddings (vectors or list of numbers that holds meaningful information about the data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_87.txt b/chunked/content_aware_chunking/_task_summary/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ce8c4d2b69c6a0a0e2bab86b782c660c585f7f8 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_87.txt @@ -0,0 +1 @@ +For a task like image captioning, the model learns relationships between image embeddings and text embeddings. 
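Unlike the other tasks above, image captioning is described here without a snippet; a minimal sketch using the image-to-text pipeline, where the checkpoint and image URL are illustrative:

from transformers import pipeline

# Illustrative image captioning checkpoint.
captioner = pipeline(task="image-to-text", model="Salesforce/blip-image-captioning-base")
preds = captioner(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
)
print(preds)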
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_88.txt b/chunked/content_aware_chunking/_task_summary/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b348be3e1a3c5014885d3bcf43383b0cd6b6a8d --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_88.txt @@ -0,0 +1,2 @@ +Document question answering +Document question answering is a task that answers natural language questions from a document. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_89.txt b/chunked/content_aware_chunking/_task_summary/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..61b08680cef9010a3ef3a69eea2d0aea2f66d854 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_89.txt @@ -0,0 +1 @@ +Unlike a token-level question answering task which takes text as input, document question answering takes an image of a document as input along with a question about the document and returns an answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_90.txt b/chunked/content_aware_chunking/_task_summary/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d3a7fa3aa9df194ac8cb5495166100e0706a403 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_90.txt @@ -0,0 +1 @@ +Document question answering can be used to parse structured documents and extract key information from it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_91.txt b/chunked/content_aware_chunking/_task_summary/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dbadb99194ac1638c154b908a9fd2a8dfd56dfe --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_91.txt @@ -0,0 +1 @@ +In the example below, the total amount and change due can be extracted from a receipt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_92.txt b/chunked/content_aware_chunking/_task_summary/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..c884c57d924c1f2b298ffe8acaccd49dd3d5ca93 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_92.txt @@ -0,0 +1,8 @@ +from transformers import pipeline +from PIL import Image +import requests +url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg" +image = Image.open(requests.get(url, stream=True).raw) +doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices") +preds = doc_question_answerer( + question="What is the total amount? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_93.txt b/chunked/content_aware_chunking/_task_summary/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..3183ec5b74887b05b6f228f0f375e48c97c031e7 --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_93.txt @@ -0,0 +1,7 @@ +", + image=image, + ) +preds +[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}] + +Hopefully, this page has given you some more background information about all the types of tasks in each modality and the practical importance of each one. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_task_summary/chunk_94.txt b/chunked/content_aware_chunking/_task_summary/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f54bf1a83e1eb8949a1b3953ecd08fec545e50e --- /dev/null +++ b/chunked/content_aware_chunking/_task_summary/chunk_94.txt @@ -0,0 +1 @@ +In the next section, you'll learn how 🤗 Transformers work to solve these tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_100.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..f73a2a433c38c4e7df5f89548b99fd07a6000948 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_100.txt @@ -0,0 +1 @@ +Mask classification groups pixels into N segments, and predicts N masks and their corresponding class label for a given image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_101.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..beb59a9c6391be4844cafddfd29725ebfc65437a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_101.txt @@ -0,0 +1 @@ +We'll explain how Mask2Former works in this section, and then you can try finetuning SegFormer at the end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_102.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..88b184280c107721b844f9f5c063b99ce230a0bc --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_102.txt @@ -0,0 +1,3 @@ +There are three main components to Mask2Former: + +A Swin backbone accepts an image and creates a low-resolution image feature map from 3 consecutive 3x3 convolutions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_103.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..677b9c5b2cd8696ded34c4606ef0eedc7764af12 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_103.txt @@ -0,0 +1 @@ +The feature map is passed to a pixel decoder which gradually upsamples the low-resolution features into high-resolution per-pixel embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_104.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..6395c7f0a1d1b0b75726f0758fe02f9ba3f496f7 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_104.txt @@ -0,0 +1 @@ +The pixel decoder actually generates multi-scale features (contains both low- and high-resolution features) with resolutions 1/32, 1/16, and 1/8th of the original image. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_105.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..571b76ef2a999c5369b2262aab8d7ae63ad3650f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_105.txt @@ -0,0 +1 @@ +Each of these feature maps of differing scales is fed successively to one Transformer decoder layer at a time in order to capture small objects from the high-resolution features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_106.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..155fc189bf120291eb57fb9cbd72aec19a0e73bf --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_106.txt @@ -0,0 +1 @@ +The key to Mask2Former is the masked attention mechanism in the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_107.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..e30da891dd83a56aaf43faa5b2af4da8327521e0 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_107.txt @@ -0,0 +1 @@ +Unlike cross-attention which can attend to the entire image, masked attention only focuses on a certain area of the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_108.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..da0a3d866b2ee17fb5a6bf99078b4a75ca678ed1 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_108.txt @@ -0,0 +1 @@ +This is faster and leads to better performance because the local features of an image are enough for the model to learn from. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_109.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..854c4b381d65dc11105110d3aa56e55c0b81a26c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_109.txt @@ -0,0 +1 @@ +Like DETR, Mask2Former also uses learned object queries and combines them with the image features from the pixel decoder to make a set prediction (class label, mask prediction). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_110.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c0fcc37c597907aac6f8cbdd71bf1fc6a6913af --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_110.txt @@ -0,0 +1 @@ +The decoder hidden states are passed into a linear layer and transformed into logits over the class labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_111.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..3287b28ede9acb904a4c2ba4c0df654a5e59f9bc --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_111.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the logits and class label to find the most likely one. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_112.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1d7ae73956999f38afc5e15a0ba0c1081440397 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_112.txt @@ -0,0 +1 @@ +The mask predictions are generated by combining the pixel-embeddings with the final decoder hidden states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_113.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..2306940eca3ea594a57e41e3cb63fcc6821eceed --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_113.txt @@ -0,0 +1 @@ +The sigmoid cross-entropy and dice losses are calculated between the logits and the ground truth mask to find the most likely mask. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_114.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e91db0076dfc05bc82f82b7a9813edc5e4d6325 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_114.txt @@ -0,0 +1 @@ +Ready to try your hand at image segmentation? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_115.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f161734c5940b2b6ba5b210c86cc8c0c706715f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_115.txt @@ -0,0 +1 @@ +Check out our complete image segmentation guide to learn how to finetune SegFormer and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_116.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..b067e7083ff939b7da3a1ee052c1011ccd0b485b --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_116.txt @@ -0,0 +1,2 @@ +Depth estimation +GLPN, Global-Local Path Network, is a Transformer for depth estimation that combines a SegFormer encoder with a lightweight decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_117.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c2b070573358848cdb0cd782ed880240b8ddfdb --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_117.txt @@ -0,0 +1 @@ +Like ViT, an image is split into a sequence of patches, except these image patches are smaller. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_118.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..47ea94129e55f8509c84eaf9c47e74be925a34f8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_118.txt @@ -0,0 +1 @@ +This is better for dense prediction tasks like segmentation or depth estimation.
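A minimal sketch of the mask classification workflow described in the chunks above, using the image-segmentation pipeline. The Mask2Former checkpoint name is an assumption for illustration; any segmentation checkpoint from the Hub (for example a finetuned SegFormer) could be substituted.

```python
from transformers import pipeline
from PIL import Image
import requests

# Assumed checkpoint name, used only for illustration.
segmenter = pipeline("image-segmentation", model="facebook/mask2former-swin-tiny-coco-instance")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # sample COCO image
image = Image.open(requests.get(url, stream=True).raw)

# Each prediction pairs a class label with a binary mask, matching the
# (class label, mask prediction) set prediction described above.
for prediction in segmenter(image):
    print(prediction["label"], prediction["score"])
```

The pipeline wraps the image processor, the model forward pass, and the post-processing of the predicted masks into a single call.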
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_119.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9864c551f163b8b6c1090bca5cd7e962e12f889 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_119.txt @@ -0,0 +1 @@ +The image patches are transformed into patch embeddings (see the image classification section for more details about how patch embeddings are created), which are fed to the encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_120.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..2126360620cd6b02cee692cad8d8176693ffb904 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_120.txt @@ -0,0 +1 @@ +The encoder accepts the patch embeddings, and passes them through several encoder blocks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_121.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..00ece587f86a9ba368511f9303520aa16cae9c20 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_121.txt @@ -0,0 +1 @@ +Each block consists of attention and Mix-FFN layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_122.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..6890240768fd40c0edb74606317282246eeac30a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_122.txt @@ -0,0 +1 @@ +The purpose of the latter is to provide positional information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_123.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..25528b6e8dfc5c0e3f4f8f9a21a1b05adf76a91d --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_123.txt @@ -0,0 +1 @@ +At the end of each encoder block is a patch merging layer for creating hierarchical representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_124.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..d75f353a997512bd4b4a13ee4d9c77b2f7e6fa1f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_124.txt @@ -0,0 +1 @@ +The features of each group of neighboring patches are concatenated, and a linear layer is applied to the concatenated features to reduce the number of patches to a resolution of 1/4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_125.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..acc5e836252739f7d0f23d0935f2959964f0b2e8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_125.txt @@ -0,0 +1 @@ +This becomes the input to the next encoder block, where this whole process is repeated until you have image features with resolutions of 1/8, 1/16, and 1/32. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_126.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..61b5e27b9bd0c6f90f971832da5494eb052d187d --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_126.txt @@ -0,0 +1 @@ +A lightweight decoder takes the last feature map (1/32 scale) from the encoder and upsamples it to 1/16 scale. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_127.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0ceedbd35036782a5bb61b56fa7a68c125b5ca1 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_127.txt @@ -0,0 +1 @@ +From here, the feature is passed into a Selective Feature Fusion (SFF) module, which selects and combines local and global features from an attention map for each feature and then upsamples it to 1/8th. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_128.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a6b160631273b4510e9a5868323d6ace914de6d --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_128.txt @@ -0,0 +1 @@ +This process is repeated until the decoded features are the same size as the original image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_129.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..31905a32797244440aef9b8affdd5610a04c575c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_129.txt @@ -0,0 +1 @@ +The output is passed through two convolution layers and then a sigmoid activation is applied to predict the depth of each pixel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_130.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..7df0c1e705ac2dcc8094ae6f9f02642807bea9d9 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_130.txt @@ -0,0 +1,2 @@ +Natural language processing +The Transformer was initially designed for machine translation, and since then, it has practically become the default architecture for solving all NLP tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_131.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dfccd29cc09c11cd537b2c8374dc4a3e5ce9188 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_131.txt @@ -0,0 +1 @@ +Some tasks lend themselves to the Transformer's encoder structure, while others are better suited for the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_132.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..551c619694eb24c1146561c113df20306b0a8134 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_132.txt @@ -0,0 +1 @@ +Still, other tasks make use of both the Transformer's encoder-decoder structure. 
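As a rough illustration of the GLPN depth estimation flow described above (hierarchical encoder features, SFF fusion, and a final sigmoid over per-pixel depth), the depth-estimation pipeline runs the whole stack in one call. The GLPN checkpoint name below is an assumption for this sketch.

```python
from transformers import pipeline
from PIL import Image
import requests

# Assumed GLPN checkpoint name, shown for illustration only.
depth_estimator = pipeline("depth-estimation", model="vinvino02/glpn-nyu")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

result = depth_estimator(image)
# "predicted_depth" is the raw per-pixel depth tensor; "depth" is the same map
# rescaled into a PIL image for easy visualization.
print(result["predicted_depth"].shape)
result["depth"].save("depth_map.png")
```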
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_133.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..f94e90b4602d29ec556fee8ddf3e0c9febfcce42 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_133.txt @@ -0,0 +1,2 @@ +Text classification +BERT is an encoder-only model and is the first model to effectively implement deep bidirectionality to learn richer representations of the text by attending to words on both sides. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_134.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..d495c40f0cd7db7ae8a7ae92145ddaf24e24fdea --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_134.txt @@ -0,0 +1 @@ +BERT uses WordPiece tokenization to generate a token embedding of the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_135.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a241dbfff47317a46072bb471f173dd6701f758 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_135.txt @@ -0,0 +1 @@ +To tell the difference between a single sentence and a pair of sentences, a special [SEP] token is added to differentiate them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_136.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b262c6ad728cdf9bdb2b09e06c1c5ae612aa93f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_136.txt @@ -0,0 +1 @@ +A special [CLS] token is added to the beginning of every sequence of text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_137.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..755f5430913d30d642a688b68b186fc6d052ebfc --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_137.txt @@ -0,0 +1 @@ +The final output with the [CLS] token is used as the input to the classification head for classification tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_138.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..56cee1c41269fb5fd73a686cb2a8454198aaeb5c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_138.txt @@ -0,0 +1 @@ +BERT also adds a segment embedding to denote whether a token belongs to the first or second sentence in a pair of sentences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_139.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1496b8119a7feeebbbd91d70ce8e43df4861736 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_139.txt @@ -0,0 +1 @@ +BERT is pretrained with two objectives: masked language modeling and next-sentence prediction. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_140.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..d81069c4fbf937fa2b9a8a9fa00db360ae6bba19 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_140.txt @@ -0,0 +1 @@ +In masked language modeling, some percentage of the input tokens are randomly masked, and the model needs to predict these. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_141.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..4597ccfde9a4cc44594202b004175024a7c633c0 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_141.txt @@ -0,0 +1 @@ +This solves the issue of bidirectionality, where the model could cheat and see all the words and "predict" the next word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_142.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b3ac9c95ce29de4aeff0c0942f0afa119e3ba4f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_142.txt @@ -0,0 +1 @@ +The final hidden states of the predicted mask tokens are passed to a feedforward network with a softmax over the vocabulary to predict the masked word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_143.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..cef895c1cfe6c3e1554db3356d55ad1096701aee --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_143.txt @@ -0,0 +1 @@ +The second pretraining objective is next-sentence prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_144.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..f53fd024f587efe35211b60b7a9068a90138516b --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_144.txt @@ -0,0 +1 @@ +The model must predict whether sentence B follows sentence A. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_145.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb8fbf7d7656976b2225937341bbfad3f6a38a84 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_145.txt @@ -0,0 +1 @@ +Half of the time sentence B is the next sentence, and the other half of the time, sentence B is a random sentence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_146.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..af3ab2e630d9618e2fefecfdbb72bfd3bb3c03e8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_146.txt @@ -0,0 +1 @@ +The prediction, whether it is the next sentence or not, is passed to a feedforward network with a softmax over the two classes (IsNext and NotNext).
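The masked language modeling objective described above is easy to probe once BERT is pretrained. A minimal sketch with the fill-mask pipeline, assuming the standard bert-base-uncased checkpoint:

```python
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="bert-base-uncased")

# The pipeline scores candidate tokens for the [MASK] position with a softmax
# over the vocabulary, mirroring the masked language modeling objective above.
for prediction in fill_mask("The capital of France is [MASK]."):
    print(prediction["token_str"], round(prediction["score"], 4))
```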
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_147.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..73f2c3ed4c60cacb24543f136df3fe88eca97139 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_147.txt @@ -0,0 +1 @@ +The input embeddings are passed through multiple encoder layers to output some final hidden states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_148.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a7bb5f709f8d6f25803b211fcd562f030c7054c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_148.txt @@ -0,0 +1 @@ +To use the pretrained model for text classification, add a sequence classification head on top of the base BERT model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_149.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5135684d70ddd9e818f2f458cfcfd5a64ca3730 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_149.txt @@ -0,0 +1 @@ +The sequence classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_150.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..801196aac2f97e81fb0733f7070af0c03bb1549a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_150.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the logits and target to find the most likely label. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_151.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d6bc7b03686165e94a361473f693500ab7b0a7c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_151.txt @@ -0,0 +1 @@ +Ready to try your hand at text classification? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_152.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9f094c1d2423ad65517f6c8f62eac4dbeacbd86 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_152.txt @@ -0,0 +1 @@ +Check out our complete text classification guide to learn how to finetune DistilBERT and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_153.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..896b54a0bc31e5b934179b81d85165474f0e6246 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_153.txt @@ -0,0 +1,2 @@ +Token classification +To use BERT for token classification tasks like named entity recognition (NER), add a token classification head on top of the base BERT model. 
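A minimal sketch of the sequence classification setup described above (a linear head over the final hidden states producing logits), assuming a DistilBERT checkpoint that has already been finetuned on SST-2 sentiment:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Assumed checkpoint: a DistilBERT finetuned for binary sentiment classification.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

inputs = tokenizer("This movie was surprisingly good!", return_tensors="pt")
with torch.no_grad():
    # The sequence classification head transforms the final hidden states into logits.
    logits = model(**inputs).logits

label_id = logits.argmax(dim=-1).item()
print(model.config.id2label[label_id])
```

During finetuning, the cross-entropy loss between these logits and the target label is what updates the head (and, optionally, the encoder underneath it).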
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_154.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..83dc64cc8484a1bfbba2c1ffec32973db9c8f860 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_154.txt @@ -0,0 +1 @@ +The token classification head is a linear layer that accepts the final hidden states and performs a linear transformation to convert them into logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_155.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..28e340b97a412f714d9c4f76e68febac6e99e373 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_155.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the logits and each token to find the most likely label. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_156.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f5ab0fd7cdd9e7cdd3405a7f86bd6c8b1f78186 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_156.txt @@ -0,0 +1 @@ +Ready to try your hand at token classification? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_157.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dba82e5cd7abb9fbb34269a4ab18361104102b9 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_157.txt @@ -0,0 +1 @@ +Check out our complete token classification guide to learn how to finetune DistilBERT and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_158.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..687271f71e62523293631283f51635a52bf607cd --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_158.txt @@ -0,0 +1,2 @@ +Question answering +To use BERT for question answering, add a span classification head on top of the base BERT model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_159.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..6000a5acff92eb1d15346bd03da8536cb85d2857 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_159.txt @@ -0,0 +1 @@ +This linear layer accepts the final hidden states and performs a linear transformation to compute the span start and end logits corresponding to the answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_160.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcd6d4ca8e322b5dbb9ecb981fe2dc2e4b1c0cc6 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_160.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the logits and the label position to find the most likely span of text corresponding to the answer. 
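A minimal sketch of token classification as described above, using the token-classification pipeline; the NER checkpoint name is an assumption, and any BERT-style model with a token classification head behaves the same way:

```python
from transformers import pipeline

# Assumed checkpoint name for a BERT model finetuned for named entity recognition.
ner = pipeline("token-classification", model="dslim/bert-base-NER", aggregation_strategy="simple")

# Each entry carries the label picked from the per-token logits plus its span in the text.
for entity in ner("Hugging Face is based in New York City."):
    print(entity["entity_group"], entity["word"], round(float(entity["score"]), 3))
```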
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_161.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9b0f3e135d7081d90b745ee2b3ee0d9fe6333c5 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_161.txt @@ -0,0 +1 @@ +Ready to try your hand at question answering? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_162.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..89abeb7c4fdbdc40cc8ec4a218e8445301df13ea --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_162.txt @@ -0,0 +1 @@ +Check out our complete question answering guide to learn how to finetune DistilBERT and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_163.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..910a966322351bb3b5bf6c9bde890d3af53c1fb4 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_163.txt @@ -0,0 +1 @@ +💡 Notice how easy it is to use BERT for different tasks once it's been pretrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_164.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd4f8fad2ce371a7204cba0c05194ef11b54d569 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_164.txt @@ -0,0 +1 @@ +You only need to add a specific head to the pretrained model to manipulate the hidden states into your desired output! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_165.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..0358013d0172ad1b93c800c5017a7c90e8524448 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_165.txt @@ -0,0 +1,2 @@ +Text generation +GPT-2 is a decoder-only model pretrained on a large amount of text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_166.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..64c2222d75e530d1cc076434b779987a57f9bb6e --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_166.txt @@ -0,0 +1 @@ +It can generate convincing (though not always true!) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_167.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0af98414ff7e2b14caf15853541f8c87ed87faa --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_167.txt @@ -0,0 +1 @@ +text given a prompt and complete other NLP tasks like question answering despite not being explicitly trained to. 
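Returning to the span classification head described just above, here is a minimal extractive question answering sketch, assuming a DistilBERT checkpoint finetuned on SQuAD:

```python
from transformers import pipeline

# Assumed checkpoint: DistilBERT distilled and finetuned on SQuAD for extractive QA.
qa = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")

result = qa(
    question="What does the span classification head compute?",
    context=(
        "The span classification head computes start and end logits over the context "
        "and returns the most likely span of text as the answer."
    ),
)
# The answer is the text between the most likely start and end positions.
print(result["answer"], round(result["score"], 3))
```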
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_168.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f7f405e9b701cbc712cc08863c9ad0a11f93c8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_168.txt @@ -0,0 +1 @@ +GPT-2 uses byte pair encoding (BPE) to tokenize words and generate a token embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_169.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..a15ec0a83cffadfdccfeb01efdc46ed738e642ab --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_169.txt @@ -0,0 +1 @@ +Positional encodings are added to the token embeddings to indicate the position of each token in the sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_170.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..2de06c08a742da5d3f90cb7607bad7bded2e9554 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_170.txt @@ -0,0 +1 @@ +The input embeddings are passed through multiple decoder blocks to output some final hidden state. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_171.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..86ec791287558accde1960308e35e01f230120e6 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_171.txt @@ -0,0 +1 @@ +Within each decoder block, GPT-2 uses a masked self-attention layer which means GPT-2 can't attend to future tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_172.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4fcf0e4c023892f982159644666c50e09b33703 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_172.txt @@ -0,0 +1 @@ +It is only allowed to attend to tokens on the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_173.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..01a542eb75c458c72dc490b4bc6cdfd5d4ca9d88 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_173.txt @@ -0,0 +1 @@ +This is different from BERT's [mask] token because, in masked self-attention, an attention mask is used to set the score to 0 for future tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_174.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f924e22ed6052d622a73227c83685357a1f3824 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_174.txt @@ -0,0 +1 @@ +The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_175.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a612d31199b537a4f61acd38ab663c46a6929a0 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_175.txt @@ -0,0 +1 @@ +The label is the next token in the sequence, which is created by shifting the logits to the right by one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_176.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc125630b663df580d20ad3b54d516bceba2eee1 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_176.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the shifted logits and the labels to output the next most likely token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_177.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb19ccdd8caa6f3936232f1b3c1eb9f704ce07fd --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_177.txt @@ -0,0 +1 @@ +GPT-2's pretraining objective is based entirely on causal language modeling, predicting the next word in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_178.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf5cb9664ea422f82db3d753bec649bcd5a4c724 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_178.txt @@ -0,0 +1 @@ +This makes GPT-2 especially good at tasks that involve generating text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_179.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..f49dd2a84f06f6b2ecb10d813a273b2a4e980ff3 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_179.txt @@ -0,0 +1 @@ +Ready to try your hand at text generation? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_180.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..97e9c869f7b4c222249c59bf0b8062aafe801a99 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_180.txt @@ -0,0 +1 @@ +Check out our complete causal language modeling guide to learn how to finetune DistilGPT-2 and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_181.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..520fbac348612a268b6d847b7157c7a648029a7f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_181.txt @@ -0,0 +1 @@ +For more information about text generation, check out the text generation strategies guide!
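A minimal sketch of causal text generation as described above, using the DistilGPT-2 checkpoint mentioned in the guide; the prompt and sampling settings are illustrative only.

```python
from transformers import pipeline, set_seed

generator = pipeline("text-generation", model="distilgpt2")
set_seed(0)  # make the sampled continuation reproducible

# At each step the model predicts the next most likely token conditioned only on
# the tokens to its left, the causal language modeling setup described above.
outputs = generator("The Transformer architecture", max_new_tokens=30, do_sample=True)
print(outputs[0]["generated_text"])
```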
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_182.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdebbb898e5b815527be8815aa97ce02c0e5e86e --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_182.txt @@ -0,0 +1,2 @@ +Summarization +Encoder-decoder models like BART and T5 are designed for the sequence-to-sequence pattern of a summarization task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_183.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bbc1af9a490bc0f79dd604515068146fcbd1560 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_183.txt @@ -0,0 +1 @@ +We'll explain how BART works in this section, and then you can try finetuning T5 at the end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_184.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a81782147fe0e43856e43969e95e398b6833769 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_184.txt @@ -0,0 +1 @@ +BART's encoder architecture is very similar to BERT and accepts a token and positional embedding of the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_185.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d6f840f3ec23375e8127383bcf0ef35d62f8299 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_185.txt @@ -0,0 +1 @@ +BART is pretrained by corrupting the input and then reconstructing it with the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_186.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..a533e2be98cdc524a2ec74ba3190873309be643e --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_186.txt @@ -0,0 +1 @@ +Unlike other encoders with specific corruption strategies, BART can apply any type of corruption. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_187.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3ba933cc5500e6727142726f521034425cf931c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_187.txt @@ -0,0 +1 @@ +The text infilling corruption strategy works the best though. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_188.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc4f752c879a7a171a76b527ec13030ca58d67f7 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_188.txt @@ -0,0 +1 @@ +In text infilling, a number of text spans are replaced with a single [mask] token. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_189.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d8d34b0ba88ea52c5f1ca32b8d14a9c1c122152 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_189.txt @@ -0,0 +1 @@ +This is important because the model has to predict the masked tokens, and it teaches the model to predict the number of missing tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_190.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..7010c232578b6b60d71c9707e6818758044872c7 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_190.txt @@ -0,0 +1 @@ +The input embeddings and masked spans are passed through the encoder to output some final hidden states, but unlike BERT, BART doesn't add a final feedforward network at the end to predict a word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_191.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3acd2ba9ec4f79281b8bd4e97483edc60c673f5 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_191.txt @@ -0,0 +1 @@ +The encoder's output is passed to the decoder, which must predict the masked tokens and any uncorrupted tokens from the encoder's output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_192.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..27c1c1ca14bd3899ede6ee7ee53b91c9a2483719 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_192.txt @@ -0,0 +1 @@ +This gives additional context to help the decoder restore the original text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_193.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f924e22ed6052d622a73227c83685357a1f3824 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_193.txt @@ -0,0 +1 @@ +The output from the decoder is passed to a language modeling head, which performs a linear transformation to convert the hidden states into logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_194.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fe3efd48d5c98f561a4fc812a50dee4522f839d --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_194.txt @@ -0,0 +1 @@ +The cross-entropy loss is calculated between the logits and the label, which is just the token shifted to the right. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_195.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e49440e69e305d846aa40a0e663f1548c7fa100 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_195.txt @@ -0,0 +1 @@ +Ready to try your hand at summarization? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_196.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bab9d3150909a5de3d0ae555d24809c59833a52 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_196.txt @@ -0,0 +1 @@ +Check out our complete summarization guide to learn how to finetune T5 and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_197.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..520fbac348612a268b6d847b7157c7a648029a7f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_197.txt @@ -0,0 +1 @@ +For more information about text generation, check out the text generation strategies guide! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_198.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..9604cf568bfbd2edf3a1ba943d6a2198ed8daf8c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_198.txt @@ -0,0 +1,2 @@ +Translation +Translation is another example of a sequence-to-sequence task, which means you can use an encoder-decoder model like BART or T5 to do it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_199.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bbc1af9a490bc0f79dd604515068146fcbd1560 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_199.txt @@ -0,0 +1 @@ +We'll explain how BART works in this section, and then you can try finetuning T5 at the end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_200.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1cf7b2959d0a8b0d91f188dfb2cb133cc639d6b --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_200.txt @@ -0,0 +1 @@ +BART adapts to translation by adding a separate randomly initialized encoder to map a source language to an input that can be decoded into the target language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_201.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0cf236047fd8c2355b584417d26188c65efc193 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_201.txt @@ -0,0 +1 @@ +This new encoder's embeddings are passed to the pretrained encoder instead of the original word embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_202.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aa60a78a78ecbb12fcffc014db09a5ce0d64f4d --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_202.txt @@ -0,0 +1 @@ +The source encoder is trained by updating the source encoder, positional embeddings, and input embeddings with the cross-entropy loss from the model output. 
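A minimal sketch of the sequence-to-sequence summarization flow described above; facebook/bart-large-cnn is a commonly used BART checkpoint finetuned for summarization, and the length limits are illustrative.

```python
from transformers import pipeline

summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

article = (
    "BART is pretrained by corrupting the input text and learning to reconstruct it with "
    "its decoder. The text infilling strategy replaces spans of text with a single mask "
    "token, so the model must also learn how many tokens are missing from each span."
)
# The encoder reads the article and the decoder generates the summary token by token.
print(summarizer(article, max_length=40, min_length=10)[0]["summary_text"])
```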
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_203.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..075e4bb272af5c8584c0d5784ae003d9d4bae097 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_203.txt @@ -0,0 +1 @@ +The model parameters are frozen in this first step, and all the model parameters are trained together in the second step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_204.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaab00bf43e9208e92f3b04877edf2cf38c3c523 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_204.txt @@ -0,0 +1 @@ +BART has since been followed up by a multilingual version, mBART, intended for translation and pretrained on many different languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_205.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea0e8f4f6c40d0fc3e69c7eaaba42257515503c4 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_205.txt @@ -0,0 +1 @@ +Ready to try your hand at translation? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_206.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..a71f39d9bfa219051b69bc288c32bdce71101c65 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_206.txt @@ -0,0 +1 @@ +Check out our complete translation guide to learn how to finetune T5 and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_207.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..520fbac348612a268b6d847b7157c7a648029a7f --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_207.txt @@ -0,0 +1 @@ +For more information about text generation, check out the text generation strategies guide! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_64.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..f68e401b015f01c6844e56d501f4b61cd7262dce --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_64.txt @@ -0,0 +1 @@ +You can feed this output to another convolutional layer, and with each successive layer, the network learns more complex and abstract things like hotdogs or rockets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_65.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..17605deabfeadbe11c11b61f11200bac580bfe88 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_65.txt @@ -0,0 +1 @@ +Between convolutional layers, it is common to add a pooling layer to reduce dimensionality and make the model more robust to variations of a feature's position. 
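A minimal sketch of translation as the sequence-to-sequence task described above; t5-small ships with English-to-German translation as one of its built-in tasks.

```python
from transformers import pipeline

translator = pipeline("translation_en_to_de", model="t5-small")

# The encoder maps the English sentence into representations the decoder
# conditions on while generating the German output.
result = translator("The encoder reads the source sentence and the decoder writes the translation.")
print(result[0]["translation_text"])
```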
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_66.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e4977540326afe1261897e37e9153b856c24e02 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_66.txt @@ -0,0 +1,3 @@ +ConvNeXT modernizes a CNN in five ways: + +Change the number of blocks in each stage and "patchify" an image with a larger stride and corresponding kernel size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_67.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0b36ca90223481503500c35b8016c470a490b0c --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_67.txt @@ -0,0 +1 @@ +The non-overlapping sliding window makes this patchifying strategy similar to how ViT splits an image into patches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_68.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8c950c23f79099f28d8ca68b61fae68635a9cf5 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_68.txt @@ -0,0 +1 @@ +A bottleneck layer shrinks the number of channels and then restores it because it is faster to do a 1x1 convolution, and you can increase the depth. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_69.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..133cd68dfef1f4cd9d4815069d68688d7cfb800a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_69.txt @@ -0,0 +1 @@ +An inverted bottleneck does the opposite by expanding the number of channels and shrinking them, which is more memory efficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_70.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..269d8504246c37c4d6cb38747bc86d95b27bd2e8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_70.txt @@ -0,0 +1 @@ +Replace the typical 3x3 convolutional layer in the bottleneck layer with depthwise convolution, which applies a convolution to each input channel separately and then stacks them back together at the end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_71.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4044792daec09a397576d5f7c0ecbadee72f1f0 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_71.txt @@ -0,0 +1 @@ +This widens the network width for improved performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_72.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..682cf6324f3cc59ce98cc6a0db4d04e87a8203ab --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_72.txt @@ -0,0 +1 @@ +ViT has a global receptive field which means it can see more of an image at once thanks to its attention mechanism. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_73.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..dffe71fa05ece0d2023ab6d92a9a379b6a9c60f6 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_73.txt @@ -0,0 +1 @@ +ConvNeXT attempts to replicate this effect by increasing the kernel size to 7x7. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_74.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf82ea443526c700b9cadd83b3d47e96fa9c88f7 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_74.txt @@ -0,0 +1 @@ +ConvNeXT also makes several layer design changes that imitate Transformer models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_75.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..e68ad4aa826265c3a976d5692e8ce5a5f1bb7e7b --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_75.txt @@ -0,0 +1 @@ +There are fewer activation and normalization layers, the activation function is switched to GELU instead of ReLU, and it uses LayerNorm instead of BatchNorm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_76.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..f45b03a69211ee703b78add3a346a919a0c7dd14 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_76.txt @@ -0,0 +1 @@ +The output from the convolution blocks is passed to a classification head which converts the outputs into logits and calculates the cross-entropy loss to find the most likely label. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_77.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..702990cb98ea58d2859fb55d1191c738f52d2bd6 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_77.txt @@ -0,0 +1,2 @@ +Object detection +DETR, DEtection TRansformer, is an end-to-end object detection model that combines a CNN with a Transformer encoder-decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_78.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..87bece58a8603616c508e24b726c52c0861de373 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_78.txt @@ -0,0 +1 @@ +A pretrained CNN backbone takes an image, represented by its pixel values, and creates a low-resolution feature map of it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_79.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..632c4bc8f53800ce243a5d88e4546ba25a8e4fdc --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_79.txt @@ -0,0 +1 @@ +A 1x1 convolution is applied to the feature map to reduce dimensionality and it creates a new feature map with a high-level image representation. 
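A minimal sketch of the image classification head described above (convolutional features turned into logits over the class labels); the ConvNeXT checkpoint name is an assumption, and any image classification checkpoint would work the same way.

```python
from transformers import pipeline
from PIL import Image
import requests

# Assumed checkpoint name for a ConvNeXT model pretrained on ImageNet-1k.
classifier = pipeline("image-classification", model="facebook/convnext-tiny-224")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# The classification head converts the pooled convolutional features into logits,
# and the pipeline reports the softmax scores for the top labels.
for prediction in classifier(image):
    print(prediction["label"], round(prediction["score"], 3))
```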
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_80.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..2033271cff411aff768e03c15fb03c0383091c31 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_80.txt @@ -0,0 +1 @@ +Since the Transformer is a sequential model, the feature map is flattened into a sequence of feature vectors that are combined with positional embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_81.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c0ec5c759a55532f6d4b460ef6164badaf3b3b5 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_81.txt @@ -0,0 +1 @@ +The feature vectors are passed to the encoder, which learns the image representations using its attention layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_82.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..24a087a3f6f65761d03180c743006fa7b4a4265a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_82.txt @@ -0,0 +1 @@ +Next, the encoder hidden states are combined with object queries in the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_83.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd95662bdbf81b652db02f193acfbdb745c04af8 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_83.txt @@ -0,0 +1 @@ +Object queries are learned embeddings that focus on the different regions of an image, and they're updated as they progress through each attention layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_84.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..913016d464d7828e81e858be2dd60359eefead00 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_84.txt @@ -0,0 +1 @@ +The decoder hidden states are passed to a feedforward network that predicts the bounding box coordinates and class label for each object query, or no object if there isn't one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_85.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0625c13216930e732f5f1f344008ee70c2307d3 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_85.txt @@ -0,0 +1 @@ +DETR decodes each object query in parallel to output N final predictions, where N is the number of queries. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_86.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..37e3a1706eaa6360733b1ceed40535dfd8a43be6 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_86.txt @@ -0,0 +1 @@ +Unlike a typical autoregressive model that predicts one element at a time, object detection is a set prediction task (bounding box, class label) that makes N predictions in a single pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_87.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a0fe1d02d056016c467da8edbba5997750eb821 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_87.txt @@ -0,0 +1 @@ +DETR uses a bipartite matching loss during training to compare a fixed number of predictions with a fixed set of ground truth labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_88.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..7caa0d4e326d83766df63b964cde552fb0960bce --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_88.txt @@ -0,0 +1 @@ +If there are fewer ground truth labels in the set of N labels, then they're padded with a no object class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_89.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a6706dcf296036523aabf8373466dfae2772b02 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_89.txt @@ -0,0 +1 @@ +This loss function encourages DETR to find a one-to-one assignment between the predictions and ground truth labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_90.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..024e76818e8f6634a62127c2a16cf0adf33e6116 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_90.txt @@ -0,0 +1 @@ +If either the bounding boxes or class labels aren't correct, a loss is incurred. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_91.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc6a926add10543a2dcbe39b158e97f866281c1a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_91.txt @@ -0,0 +1 @@ +Likewise, if DETR predicts an object that doesn't exist, it is penalized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_92.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bf3f8804cac1441f70de0dd3677f1c919b5b2ad --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_92.txt @@ -0,0 +1 @@ +This encourages DETR to find other objects in an image instead of focusing on one really prominent object. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_93.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac0699dfc4e0721c925c5f1f31479ff558c5862a --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_93.txt @@ -0,0 +1 @@ +An object detection head is added on top of DETR to find the class label and the coordinates of the bounding box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_94.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd336ce12178f14491b2a1a88338b944747d79e1 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_94.txt @@ -0,0 +1 @@ +There are two components to the object detection head: a linear layer to transform the decoder hidden states into logits over the class labels, and a MLP to predict the bounding box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_95.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e91db0076dfc05bc82f82b7a9813edc5e4d6325 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_95.txt @@ -0,0 +1 @@ +Ready to try your hand at object detection? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_96.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee19bd667a35e873f022ca48e40f03956b7465ce --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_96.txt @@ -0,0 +1 @@ +Check out our complete object detection guide to learn how to finetune DETR and use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_97.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..192cb291450191d08b83e281c7632b2bebba6ddd --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_97.txt @@ -0,0 +1,2 @@ +Image segmentation +Mask2Former is a universal architecture for solving all types of image segmentation tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_98.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..a351bc9e6263477c650770d2b0d6f1095735e39b --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_98.txt @@ -0,0 +1 @@ +Traditional segmentation models are typically tailored towards a particular subtask of image segmentation, like instance, semantic or panoptic segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tasks_explained/chunk_99.txt b/chunked/content_aware_chunking/_tasks_explained/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..114a542833c965f2b61f0822c86ba860a5b60851 --- /dev/null +++ b/chunked/content_aware_chunking/_tasks_explained/chunk_99.txt @@ -0,0 +1 @@ +Mask2Former frames each of those tasks as a mask classification problem. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_100.txt b/chunked/content_aware_chunking/_testing/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0cac203952ae040d0ed03986e2711b5eb295e02 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_100.txt @@ -0,0 +1,3 @@ +Files and directories +In tests, we often need to know where things are relative to the current test file, and it's not trivial since the test +could be invoked from more than one directory or could reside in sub-directories with different depths. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_101.txt b/chunked/content_aware_chunking/_testing/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1fff1830f52283b835dcf416085efa7fd2df854 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_101.txt @@ -0,0 +1,7 @@ +A helper class +transformers.testing_utils.TestCasePlus solves this problem by sorting out all the basic paths and providing easy +accessors to them: + +pathlib objects (all fully resolved): + +test_file_path - the current test file path, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_102.txt b/chunked/content_aware_chunking/_testing/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8d0372bce0e2536e1d2a8eb9e17ac3ed0d6d78 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_102.txt @@ -0,0 +1,8 @@ +__file__ + +test_file_dir - the directory containing the current test file +tests_dir - the directory of the tests test suite +examples_dir - the directory of the examples test suite +repo_root_dir - the directory of the repository + +src_dir - the directory of src (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_103.txt b/chunked/content_aware_chunking/_testing/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..acea1320d1bb4ff224573f843eaf2b81feb6fec6 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_103.txt @@ -0,0 +1,14 @@ +where the transformers sub-dir resides) + +stringified paths---same as above but these return paths as strings, rather than pathlib objects: + +test_file_path_str + +test_file_dir_str +tests_dir_str +examples_dir_str +repo_root_dir_str +src_dir_str + +To start using those, all you need is to make sure that the test resides in a subclass of +transformers.testing_utils.TestCasePlus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_104.txt b/chunked/content_aware_chunking/_testing/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..251d0e0f9646496587ed09c2944f151f441343e7 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_104.txt @@ -0,0 +1,9 @@ +For example: +python +from transformers.testing_utils import TestCasePlus +class PathExampleTest(TestCasePlus): + def test_something_involving_local_locations(self): + data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro" + +If you don't need to manipulate paths via pathlib or you just need a path as a string, you can always invoke +str() on the pathlib object or use the accessors ending with _str.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_105.txt b/chunked/content_aware_chunking/_testing/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..f676f4cb8fd763e27aaa7cd665a94ae3db0bad88 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_105.txt @@ -0,0 +1,10 @@ +For example: +python +from transformers.testing_utils import TestCasePlus +class PathExampleTest(TestCasePlus): + def test_something_involving_stringified_locations(self): + examples_dir = self.examples_dir_str + +Temporary files and directories +Using unique temporary files and directories is essential for parallel test running, so that the tests won't overwrite +each other's data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_106.txt b/chunked/content_aware_chunking/_testing/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..21422512dce7112ac77749ec2403b34fb876214f --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_106.txt @@ -0,0 +1,2 @@ +We also want the temporary files and directories removed at the end of each test that created +them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_107.txt b/chunked/content_aware_chunking/_testing/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..869b2f8a776715a458954dcc1cf64014c7a2566a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_107.txt @@ -0,0 +1 @@ +Therefore, using packages like tempfile, which address these needs, is essential. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_108.txt b/chunked/content_aware_chunking/_testing/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..59fdba6765e74259b509fe1125ed7cbd37edd3e6 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_108.txt @@ -0,0 +1,2 @@ +However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want +to know its exact path and not have it randomized on every test re-run. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_109.txt b/chunked/content_aware_chunking/_testing/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..e794edf3b63f716b0fb3d3b0d4a8f10b8453854b --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_109.txt @@ -0,0 +1 @@ +A helper class transformers.testing_utils.TestCasePlus is best used for such purposes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_110.txt b/chunked/content_aware_chunking/_testing/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a09dd65954ae12ee070f3a9246163b4c31255a6 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_110.txt @@ -0,0 +1,2 @@ +It's a sub-class of +unittest.TestCase, so we can easily inherit from it in the test modules.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_111.txt b/chunked/content_aware_chunking/_testing/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..137eb24a676b4b24f51057cec1c0d14ffea43b79 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_111.txt @@ -0,0 +1,8 @@ +Here is an example of its usage: +thon +from transformers.testing_utils import TestCasePlus +class ExamplesTests(TestCasePlus): + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() + +This code creates a unique temporary directory, and sets tmp_dir to its location. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_112.txt b/chunked/content_aware_chunking/_testing/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccf9a67a0cf3bea28208e33a12e4c4d43c98b583 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_112.txt @@ -0,0 +1,6 @@ +Create a unique temporary dir: + +python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() +tmp_dir will contain the path to the created temporary dir. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_113.txt b/chunked/content_aware_chunking/_testing/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef274eebf9cf1098f4a320ff28c2275a1da367a1 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_113.txt @@ -0,0 +1,2 @@ +It will be automatically removed at the end of the +test. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_114.txt b/chunked/content_aware_chunking/_testing/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..c59d041c061cfd10c6ecd17415bc6370e05f28d0 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_114.txt @@ -0,0 +1 @@ +Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_115.txt b/chunked/content_aware_chunking/_testing/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f8f351466c9968a6dd8c5accfb65fd391119c52 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_115.txt @@ -0,0 +1,5 @@ +python +def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") +This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests didn't +leave any data in there. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_116.txt b/chunked/content_aware_chunking/_testing/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f20c60d076c62b716bfb48311db0017da1b045e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_116.txt @@ -0,0 +1,4 @@ +You can override the default behavior by directly overriding the before and after args, leading to one of the + following behaviors: + +before=True: the temporary dir will always be cleared at the beginning of the test. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_117.txt b/chunked/content_aware_chunking/_testing/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..072e835f658724b97c1be3c29bcd9d119e0f7a99 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_117.txt @@ -0,0 +1 @@ +before=False: if the temporary dir already existed, any existing files will remain there. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_118.txt b/chunked/content_aware_chunking/_testing/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bff6d39ac9f7aa798e98891bb91dec94abe3cd3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_118.txt @@ -0,0 +1 @@ +after=True: the temporary dir will always be deleted at the end of the test. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_119.txt b/chunked/content_aware_chunking/_testing/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..15abe81d18c5f7f44e927e09f9e31b25b860d433 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_119.txt @@ -0,0 +1 @@ +after=False: the temporary dir will always be left intact at the end of the test. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_120.txt b/chunked/content_aware_chunking/_testing/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd184a7377104ad6847a0db0fa1e07b26d73c0e9 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_120.txt @@ -0,0 +1,3 @@ +In order to run the equivalent of rm -r safely, only subdirs of the project repository checkout are allowed if +an explicit tmp_dir is used, so that by mistake no /tmp or similar important part of the filesystem will +get nuked. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_121.txt b/chunked/content_aware_chunking/_testing/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..431383898e55ab6d4a54f88c2ed6f25fc0ebaf70 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_121.txt @@ -0,0 +1 @@ +i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_122.txt b/chunked/content_aware_chunking/_testing/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..5154ddda9176195de2e3b2b4437fbc1d99f9294d --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_122.txt @@ -0,0 +1 @@ +please always pass paths that start with ./. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_123.txt b/chunked/content_aware_chunking/_testing/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f172aec022ceaf44cd1c62f95e823bdab8b3b40 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_123.txt @@ -0,0 +1,2 @@ +Each test can register multiple temporary directories and they all will get auto-removed, unless requested +otherwise. 
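Putting the before and after flags together, a typical debugging setup might look like the following sketch, keeping the convention of passing a ./-prefixed path:
python
def test_whatever(self):
    # start from a clean directory, but keep its contents around for inspection after the test
    tmp_dir = self.get_auto_remove_tmp_dir("./xxx", before=True, after=False)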
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_124.txt b/chunked/content_aware_chunking/_testing/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cb4412703a14838586f206c995fccb92db3d31b --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_124.txt @@ -0,0 +1,3 @@ +Temporary sys.path override +If you need to temporarily override sys.path to import from another test, for example, you can use the +ExtendSysPath context manager. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_125.txt b/chunked/content_aware_chunking/_testing/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..65eb50e73724e7f7bdbe099d34f601f5c8d19bba --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_125.txt @@ -0,0 +1,10 @@ +Example: +python +import os +from transformers.testing_utils import ExtendSysPath +bindir = os.path.abspath(os.path.dirname(__file__)) +with ExtendSysPath(f"{bindir}/.."): + from test_trainer import TrainerIntegrationCommon # noqa + +Skipping tests +This is useful when a bug is found and a new test is written, yet the bug is not fixed yet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_126.txt b/chunked/content_aware_chunking/_testing/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..2128801e7b555281d67fc09d5b9f6ad1de25761d --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_126.txt @@ -0,0 +1,2 @@ +In order to be able to +commit it to the main repository we need to make sure it's skipped during make test. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_127.txt b/chunked/content_aware_chunking/_testing/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..12d1351bef2195a3fdd9d80704653217cc05f0a0 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_127.txt @@ -0,0 +1,4 @@ +Methods: + +A skip means that you expect your test to pass only if some conditions are met, otherwise pytest should skip + running the test altogether. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_128.txt b/chunked/content_aware_chunking/_testing/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b21370cc4a285468c81f5a0647bd8781ea1032 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_128.txt @@ -0,0 +1,2 @@ +Common examples are skipping windows-only tests on non-windows platforms, or skipping + tests that depend on an external resource which is not available at the moment (for example a database). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_129.txt b/chunked/content_aware_chunking/_testing/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..42a25611e79850fe3c1fb7215d90c46797313ae3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_129.txt @@ -0,0 +1 @@ +An xfail means that you expect a test to fail for some reason.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_130.txt b/chunked/content_aware_chunking/_testing/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..82672ab95eb2c93e27e13da190041d57c9216fe9 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_130.txt @@ -0,0 +1,2 @@ +A common example is a test for a feature not yet + implemented, or a bug not yet fixed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_131.txt b/chunked/content_aware_chunking/_testing/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..cba5889d0eeda5a614b9cca4d7a3facd96a11e6f --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_131.txt @@ -0,0 +1,2 @@ +When a test passes despite being expected to fail (marked with + pytest.mark.xfail), it’s an xpass and will be reported in the test summary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_132.txt b/chunked/content_aware_chunking/_testing/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..d590be35efa4c520a98bc13275565b20b638eb90 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_132.txt @@ -0,0 +1 @@ +One of the important differences between the two is that skip doesn't run the test, and xfail does. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_133.txt b/chunked/content_aware_chunking/_testing/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bd8f2039387cbd26fdd22789e762d70767d5a66 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_133.txt @@ -0,0 +1,2 @@ +So if the +code that's buggy causes some bad state that will affect other tests, do not use xfail. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_134.txt b/chunked/content_aware_chunking/_testing/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..59a8de03849b3528e1f993e82bb0052aec57a076 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_134.txt @@ -0,0 +1,50 @@ +Implementation + +Here is how to skip whole test unconditionally: + +python no-style +@unittest.skip("this bug needs to be fixed") +def test_feature_x(): +or via pytest: +python no-style +@pytest.mark.skip(reason="this bug needs to be fixed") +or the xfail way: +python no-style +@pytest.mark.xfail +def test_feature_x(): +Here's how to skip a test based on internal checks within the test: +python +def test_feature_x(): + if not has_something(): + pytest.skip("unsupported configuration") +or the whole module: +thon +import pytest +if not pytest.config.getoption("--custom-flag"): + pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True) + +or the xfail way: +python +def test_feature_x(): + pytest.xfail("expected to fail until bug XYZ is fixed") + +Here is how to skip all tests in a module if some import is missing: + +python +docutils = pytest.importorskip("docutils", minversion="0.3") + +Skip a test based on a condition: + +python no-style +@pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher") +def test_feature_x(): +or: +python no-style +@unittest.skipIf(torch_device == "cpu", "Can't do half precision") +def test_feature_x(): +or skip the whole module: +python no-style +@pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows") +class TestClass(): + def test_feature_x(self): +More details, example and ways are here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_135.txt b/chunked/content_aware_chunking/_testing/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..76785746a17fbe3cecfaa1f1469c759cc1e31438 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_135.txt @@ -0,0 +1,3 @@ +Slow tests +The library of tests is ever-growing, and some of the tests take minutes to run, therefore we can't afford waiting for +an hour for the test suite to complete on CI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_136.txt b/chunked/content_aware_chunking/_testing/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..434fd2726b4935dc0e9274b9a2317a48d1bdf35e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_136.txt @@ -0,0 +1,7 @@ +Therefore, with some exceptions for essential tests, slow tests should be +marked as in the example below: +python no-style +from transformers.testing_utils import slow +@slow +def test_integration_foo(): +Once a test is marked as @slow, to run such tests set RUN_SLOW=1 env var, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_137.txt b/chunked/content_aware_chunking/_testing/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..1718ce7c2645ab217c4d71f41e228a46fef3ae61 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_137.txt @@ -0,0 +1,5 @@ +: + +RUN_SLOW=1 pytest tests +Some decorators like @parameterized rewrite test names, therefore @slow and the rest of the skip decorators +@require_* have to be listed last for them to work correctly. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_138.txt b/chunked/content_aware_chunking/_testing/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..06f79aab5f6045b35072e79e70ad944c8f8a99bd --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_138.txt @@ -0,0 +1,7 @@ +Here is an example of the correct usage: +python no-style +@parameterized.expand() +@slow +def test_integration_foo(): +As explained at the beginning of this document, slow tests get to run on a scheduled basis, rather than in PRs CI +checks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_139.txt b/chunked/content_aware_chunking/_testing/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bd1e06dc14e80968636e47a9ee7bc21406e4dc3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_139.txt @@ -0,0 +1 @@ +So it's possible that some problems will be missed during a PR submission and get merged. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_140.txt b/chunked/content_aware_chunking/_testing/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d18b5ed71d9bfcc974682f3794ed3b5bfd658db --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_140.txt @@ -0,0 +1,2 @@ +Such problems will +get caught during the next scheduled CI job. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_141.txt b/chunked/content_aware_chunking/_testing/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd2a645ae9056bfb7fc0edf7943ed592d8ae48b7 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_141.txt @@ -0,0 +1,2 @@ +But it also means that it's important to run the slow tests on your +machine before submitting the PR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_142.txt b/chunked/content_aware_chunking/_testing/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a59835cf1868fa65b2876a949fe7342b4cd8a0 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_142.txt @@ -0,0 +1,3 @@ +Here is a rough decision making mechanism for choosing which tests should be marked as slow: +If the test is focused on one of the library's internal components (e.g., modeling files, tokenization files, +pipelines), then we should run that test in the non-slow test suite. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_143.txt b/chunked/content_aware_chunking/_testing/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0f7748820cf71a0c71460b4b92e00924523ebc5 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_143.txt @@ -0,0 +1,2 @@ +If it's focused on an other aspect of the library, +such as the documentation or the examples, then we should run these tests in the slow test suite. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_144.txt b/chunked/content_aware_chunking/_testing/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..72efeb46824474fa2bdfe18eaa164c81a2f517de --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_144.txt @@ -0,0 +1,5 @@ +And then, to refine +this approach we should have exceptions: + +All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or + tokenizer integration tests, pipeline integration tests) should be set to slow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_145.txt b/chunked/content_aware_chunking/_testing/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..15aef0e6e0cabec308b2a8dee915047dd61019dc --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_145.txt @@ -0,0 +1,2 @@ +If you're adding a new model, you + should create and upload to the hub a tiny version of it (with random weights) for integration tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_146.txt b/chunked/content_aware_chunking/_testing/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8f6a72ee3d14ce04bf07d89e545ae220090829c --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_146.txt @@ -0,0 +1,2 @@ +This is + discussed in the following paragraphs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_147.txt b/chunked/content_aware_chunking/_testing/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..82a3b6f47821785489268e21d16ab8ea5beaca12 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_147.txt @@ -0,0 +1 @@ +All tests that need to do a training not specifically optimized to be fast should be set to slow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_148.txt b/chunked/content_aware_chunking/_testing/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..4314d7f427c2ca61f0d9accb1d741daa13786087 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_148.txt @@ -0,0 +1,2 @@ +We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to + @slow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_149.txt b/chunked/content_aware_chunking/_testing/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a35f67af0f6a343fba0cced7455380de8c6720 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_149.txt @@ -0,0 +1,2 @@ +Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked + as @slow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_150.txt b/chunked/content_aware_chunking/_testing/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..40ab870bf973cc624a47844b1262714613765b0a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_150.txt @@ -0,0 +1 @@ +If a test completes under 1 second on CI (including downloads if any) then it should be a normal test regardless. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_151.txt b/chunked/content_aware_chunking/_testing/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3f0c424aa34c4a624546a0d5490dd6712dc82c0 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_151.txt @@ -0,0 +1 @@ +Collectively, all the non-slow tests need to entirely cover the different internals, while remaining fast. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_152.txt b/chunked/content_aware_chunking/_testing/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..129e0404267723b49b4c1486deb44c8b6d26e694 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_152.txt @@ -0,0 +1,2 @@ +For example, +significant coverage can be achieved by testing with specially created tiny models with random weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_153.txt b/chunked/content_aware_chunking/_testing/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..90f701f9cbb0d0913bceda6fc67bec0e0d412713 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_153.txt @@ -0,0 +1,2 @@ +Such models +have a very minimal number of layers (e.g., 2), vocab size (e.g., 1000), etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_154.txt b/chunked/content_aware_chunking/_testing/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..16d50b0801b6a6fee92e8834a716d36a58887b66 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_154.txt @@ -0,0 +1,2 @@ +Then the @slow tests can use large +slow models to do qualitative testing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_155.txt b/chunked/content_aware_chunking/_testing/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..a429763ab476c1ee6cca0d17a5d030da719ec723 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_155.txt @@ -0,0 +1,5 @@ +To see the use of these, simply look for tiny models with: + +grep tiny tests examples +Here is an example of a script that created the tiny model +stas/tiny-wmt19-en-de. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_156.txt b/chunked/content_aware_chunking/_testing/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a8e75734eef00731dc31ba8195c930b12566b31 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_156.txt @@ -0,0 +1,2 @@ +You can easily adjust it to your specific +model's architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_157.txt b/chunked/content_aware_chunking/_testing/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..b316cebefe8e6417bacdf68dd9545e65f4e7bde6 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_157.txt @@ -0,0 +1,2 @@ +It's easy to measure the run-time incorrectly if, for example, there is an overhead of downloading a huge model, but if +you test it locally the downloaded files would be cached and thus the download time not measured.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_158.txt b/chunked/content_aware_chunking/_testing/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..af20c03f40c9c9c9a3d4d0a43d0e5e0f3a970671 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_158.txt @@ -0,0 +1,2 @@ +Hence check the +execution speed report in CI logs instead (the output of pytest --durations=0 tests). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_159.txt b/chunked/content_aware_chunking/_testing/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..43cc3fd09074530a95dd00f6bfcf675304ce8ea0 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_159.txt @@ -0,0 +1 @@ +That report is also useful to find slow outliers that aren't marked as such, or which need to be re-written to be fast. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_160.txt b/chunked/content_aware_chunking/_testing/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..237e6a22bcea27f9bba3d0c0bf69ced961a9eda5 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_160.txt @@ -0,0 +1,2 @@ +If you notice that the test suite starts getting slow on CI, the top listing of this report will show the slowest +tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_161.txt b/chunked/content_aware_chunking/_testing/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5e585521f4f334d1178655b260f4b5d6561257e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_161.txt @@ -0,0 +1,3 @@ +Testing the stdout/stderr output +In order to test functions that write to stdout and/or stderr, the test can access those streams using the +pytest's capsys system. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_162.txt b/chunked/content_aware_chunking/_testing/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bf22695dc406f7d71669d30947d3c3884fea740 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_162.txt @@ -0,0 +1,52 @@ +Here is how this is accomplished: +thon +import sys +def print_to_stdout(s): + print(s) +def print_to_stderr(s): + sys.stderr.write(s) +def test_result_and_stdout(capsys): + msg = "Hello" + print_to_stdout(msg) + print_to_stderr(msg) + out, err = capsys.readouterr() # consume the captured output streams + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + sys.stderr.write(err) + # test: + assert msg in out + assert msg in err + +And, of course, most of the time, stderr will come as a part of an exception, so try/except has to be used in such +a case: +thon +def raise_exception(msg): + raise ValueError(msg) +def test_something_exception(): + msg = "Not a good value" + error = "" + try: + raise_exception(msg) + except Exception as e: + error = str(e) + assert msg in error, f"{msg} is in the exception:\n{error}" + +Another approach to capturing stdout is via contextlib.redirect_stdout: +thon +from io import StringIO +from contextlib import redirect_stdout +def print_to_stdout(s): + print(s) +def test_result_and_stdout(): + msg = "Hello" + buffer = StringIO() + with redirect_stdout(buffer): + print_to_stdout(msg) + out = buffer.getvalue() + # optional: if you want to replay the consumed streams: + sys.stdout.write(out) + # test: + assert msg in out + +An important potential issue with capturing stdout is that it may contain \r characters that in normal print +reset everything that has been printed so far. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_163.txt b/chunked/content_aware_chunking/_testing/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..e653dfca82da5c47eec44ca7221f6eb6a162c32c --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_163.txt @@ -0,0 +1,3 @@ +There is no problem with pytest, but with pytest -s these +characters get included in the buffer, so to be able to have the test run with and without -s, you have to make an +extra cleanup to the captured output, using re.sub(r'~. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_164.txt b/chunked/content_aware_chunking/_testing/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5ea3b7003841f11d790297c40da52380556e897 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_164.txt @@ -0,0 +1 @@ +*\r', '', buf, 0, re.M). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_165.txt b/chunked/content_aware_chunking/_testing/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..88c388cf7199dec8aa402355c28963d6d8c79af2 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_165.txt @@ -0,0 +1,33 @@ +But, then we have a helper context manager wrapper to automatically take care of it all, regardless of whether it has +some \r's in it or not, so it's a simple: +thon +from transformers.testing_utils import CaptureStdout +with CaptureStdout() as cs: + function_that_writes_to_stdout() +print(cs.out) + +Here is a full test example: +thon +from transformers.testing_utils import CaptureStdout +msg = "Secret message\r" +final = "Hello World" +with CaptureStdout() as cs: + print(msg + final) +assert cs.out == final + "\n", f"captured: {cs.out}, expecting {final}" + +If you'd like to capture stderr use the CaptureStderr class instead: +thon +from transformers.testing_utils import CaptureStderr +with CaptureStderr() as cs: + function_that_writes_to_stderr() +print(cs.err) + +If you need to capture both streams at once, use the parent CaptureStd class: +thon +from transformers.testing_utils import CaptureStd +with CaptureStd() as cs: + function_that_writes_to_stdout_and_stderr() +print(cs.err, cs.out) + +Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit +from the context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_166.txt b/chunked/content_aware_chunking/_testing/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..caf01e323c96fc76eef9ca8d16db5e0158e94425 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_166.txt @@ -0,0 +1,24 @@ +Capturing logger stream +If you need to validate the output of a logger, you can use CaptureLogger: +thon +from transformers import logging +from transformers.testing_utils import CaptureLogger +msg = "Testing 1, 2, 3" +logging.set_verbosity_info() +logger = logging.get_logger("transformers.models.bart.tokenization_bart") +with CaptureLogger(logger) as cl: + logger.info(msg) +assert cl.out, msg + "\n" + +Testing with environment variables +If you want to test the impact of environment variables for a specific test you can use a helper decorator +transformers.testing_utils.mockenv +thon +from transformers.testing_utils import mockenv +class HfArgumentParserTest(unittest.TestCase): + @mockenv(TRANSFORMERS_VERBOSITY="error") + def test_env_override(self): + env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None) + +At times an external program needs to be called, which requires setting PYTHONPATH in os.environ to include +multiple local paths. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_167.txt b/chunked/content_aware_chunking/_testing/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..05b73944beb2a8b6480b1f693cc7e836acbcc4d2 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_167.txt @@ -0,0 +1,12 @@ +A helper class transformers.test_utils.TestCasePlus comes to help: +thon +from transformers.testing_utils import TestCasePlus +class EnvExampleTest(TestCasePlus): + def test_external_prog(self): + env = self.get_env() + # now call the external program, passing env to it + +Depending on whether the test file was under the tests test suite or examples it'll correctly set up +env[PYTHONPATH] to include one of these two directories, and also the src directory to ensure the testing is +done against the current repo, and finally with whatever env[PYTHONPATH] was already set to before the test was +called if anything. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_168.txt b/chunked/content_aware_chunking/_testing/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..a94a3a7521425e9f2a6f418b478c0fe2fb88cc1a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_168.txt @@ -0,0 +1 @@ +This helper method creates a copy of the os.environ object, so the original remains intact. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_169.txt b/chunked/content_aware_chunking/_testing/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..32a4107a4aa860a10b307bf6fe55b0d01cbb108e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_169.txt @@ -0,0 +1,2 @@ +Getting reproducible results +In some situations you may want to remove randomness for your tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_170.txt b/chunked/content_aware_chunking/_testing/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c9cb0131afa02d21bdf27f0f498994a962dca7c --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_170.txt @@ -0,0 +1,27 @@ +To get identical reproducible results set, you +will need to fix the seed: +thon +seed = 42 +python RNG +import random +random.seed(seed) +pytorch RNGs +import torch +torch.manual_seed(seed) +torch.backends.cudnn.deterministic = True +if torch.cuda.is_available(): + torch.cuda.manual_seed_all(seed) +numpy RNG +import numpy as np +np.random.seed(seed) +tf RNG +tf.random.set_seed(seed) + +Debugging tests +To start a debugger at the point of the warning, do this: + +pytest tests/utils/test_logging.py -W error::UserWarning --pdb +Working with github actions workflows +To trigger a self-push workflow CI job, you must: + +Create a new branch on transformers origin (not a fork!). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_171.txt b/chunked/content_aware_chunking/_testing/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5c2fe612ca92cf089b3541cf460d2fefe1ad0d1 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_171.txt @@ -0,0 +1,2 @@ +The branch name has to start with either ci_ or ci- (main triggers it too, but we can't do PRs on + main). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_172.txt b/chunked/content_aware_chunking/_testing/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..a915a2467e66c1b08c6c3bc67d2b797bff6afba9 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_172.txt @@ -0,0 +1,3 @@ +It also gets triggered only for specific paths - you can find the up-to-date definition in case it + changed since this document has been written here under push: +Create a PR from this branch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_173.txt b/chunked/content_aware_chunking/_testing/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..8db392a90ef8528c9706299b6c5b2c10bd2561dd --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_173.txt @@ -0,0 +1 @@ +Then you can see the job appear here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_174.txt b/chunked/content_aware_chunking/_testing/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e3d1a3bd8cd31299eab513085933f42095c35bb --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_174.txt @@ -0,0 +1,2 @@ +It may not run right away if there + is a backlog. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_175.txt b/chunked/content_aware_chunking/_testing/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd96647aac383e0e73a408c4696031436b7dcd81 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_175.txt @@ -0,0 +1,2 @@ +Testing Experimental CI Features +Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_176.txt b/chunked/content_aware_chunking/_testing/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..92361fdd7196576a98097c1e463c9d505fc5b1ec --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_176.txt @@ -0,0 +1,2 @@ +Therefore if a +new CI feature is to be added, it should be done as following. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_177.txt b/chunked/content_aware_chunking/_testing/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..8996061580111fe949fe4cbd2089ec13be527e3a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_177.txt @@ -0,0 +1,2 @@ +Create a new dedicated job that tests what needs to be tested +The new job must always succeed so that it gives us a green ✓ (details below). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_178.txt b/chunked/content_aware_chunking/_testing/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..02b692419ce4fa4ddb11c5d591e773e4babe5915 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_178.txt @@ -0,0 +1,2 @@ +Let it run for some days to see that a variety of different PR types get to run on it (user fork branches, + non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_179.txt b/chunked/content_aware_chunking/_testing/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..e727280ab6d8aee99e69cccf5d56dfbc689b561c --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_179.txt @@ -0,0 +1,4 @@ +- there + are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always + green) +When it's clear that everything is solid, then merge the new changes into existing jobs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_180.txt b/chunked/content_aware_chunking/_testing/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fc46d8a3e659afa72fd9ee2feaf1dde123b0481 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_180.txt @@ -0,0 +1 @@ +That way experiments on CI functionality itself won't interfere with the normal workflow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_181.txt b/chunked/content_aware_chunking/_testing/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bb99b940b9c4317e420fc70b7c7a8cc80e6c68d --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_181.txt @@ -0,0 +1 @@ +Now how can we make the job always succeed while the new CI feature is being developed? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_182.txt b/chunked/content_aware_chunking/_testing/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66094c31b18b4cc6e2027f55555ea510616610b --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_182.txt @@ -0,0 +1,2 @@ +Some CIs, like TravisCI support ignore-step-failure and will report the overall job as successful, but CircleCI and +Github Actions as of this writing don't support that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_183.txt b/chunked/content_aware_chunking/_testing/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d5f1d686522ab06a00a3368e7f62c2c2c9c6e70 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_183.txt @@ -0,0 +1,3 @@ +So the following workaround can be used: + +set +euo pipefail at the beginning of the run command to suppress most potential failures in the bash script. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_184.txt b/chunked/content_aware_chunking/_testing/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..b30abdf5e732412fce9ce4f87e096af45ceec872 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_184.txt @@ -0,0 +1,21 @@ +the last command must be a success: echo "done" or just true will do + +Here is an example: +yaml +- run: + name: run CI experiment + command: | + set +euo pipefail + echo "setting run-all-despite-any-errors-mode" + this_command_will_fail + echo "but bash continues to run" + # emulate another failure + false + # but the last command must be a success + echo "during experiment do not remove: reporting success to CI, even if there were failures" +For simple commands you could also do: + +cmd_that_may_fail || true +Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs, +while removing set +euo pipefail or any other things you may have added to ensure that the experimental job doesn't +interfere with the normal CI functioning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_185.txt b/chunked/content_aware_chunking/_testing/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..2431c9fe31560b42ba123c3824d658df4ca7af5a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_185.txt @@ -0,0 +1,2 @@ +This whole process would have been much easier if we only could set something like allow-failure for the +experimental step, and let it fail without impacting the overall status of PRs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_186.txt b/chunked/content_aware_chunking/_testing/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..b942fb7ac1e218617797c9cf2be8886ee987d7c4 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_186.txt @@ -0,0 +1,2 @@ +But as mentioned earlier CircleCI and +Github Actions don't support it at the moment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_187.txt b/chunked/content_aware_chunking/_testing/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b355d1fc21f2e4f620f7f8f9198b15c7a7c5e3e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_187.txt @@ -0,0 +1,7 @@ +You can vote for this feature and see where it is at these CI-specific threads: + +Github Actions: +CircleCI: + +DeepSpeed integration +For a PR that involves the DeepSpeed integration, keep in mind our CircleCI PR CI setup doesn't have GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_188.txt b/chunked/content_aware_chunking/_testing/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5ce6760f86a18671f7853634d2593ef819335a3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_188.txt @@ -0,0 +1 @@ +Tests requiring GPUs are run on a different CI nightly. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_189.txt b/chunked/content_aware_chunking/_testing/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62cf082ffbebcc1397af994c11a338851e1aa3a --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_189.txt @@ -0,0 +1 @@ +This means if you get a passing CI report in your PR, it doesn’t mean the DeepSpeed tests pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_190.txt b/chunked/content_aware_chunking/_testing/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a716dc380b4fc9a3803b4087ccd1dc8092dbcbf --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_190.txt @@ -0,0 +1,4 @@ +To run DeepSpeed tests: + +RUN_SLOW=1 pytest tests/deepspeed/test_deepspeed.py +Any changes to the modeling or PyTorch examples code requires running the model zoo tests as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_191.txt b/chunked/content_aware_chunking/_testing/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8e8ee75ac37319acc6809fa1b48ca12a9aa7628 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_191.txt @@ -0,0 +1 @@ +RUN_SLOW=1 pytest tests/deepspeed \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_67.txt b/chunked/content_aware_chunking/_testing/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d3e4076b69c29e7c10ba98f4db95723c631aac8 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_67.txt @@ -0,0 +1 @@ +It can also be used to achieve the same effect as CUDA_VISIBLE_DEVICES by targeting specific GPUs or testing in CPU-only mode. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_68.txt b/chunked/content_aware_chunking/_testing/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..97df4e20e1d14a5e7e1cdeec9e6450eed1f58b62 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_68.txt @@ -0,0 +1 @@ +Certain devices will require an additional import after importing torch for the first time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_69.txt b/chunked/content_aware_chunking/_testing/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..22d48d9a82825eaca0d684aa63d0d0521a6bbeb9 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_69.txt @@ -0,0 +1,4 @@ +This can be specified using the environment variable TRANSFORMERS_TEST_BACKEND: + +TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py +Alternative backends may also require the replacement of device-specific functions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_70.txt b/chunked/content_aware_chunking/_testing/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..65d44e9cc0f9ee390a5bc9e5513b9a8dbd687caa --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_70.txt @@ -0,0 +1 @@ +For example torch.cuda.manual_seed may need to be replaced with a device-specific seed setter like torch.npu.manual_seed to correctly set a random seed on the device. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_71.txt b/chunked/content_aware_chunking/_testing/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa6005942798e1d4c699aed01671fde4df497856 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_71.txt @@ -0,0 +1,5 @@ +To specify a new backend with backend-specific device functions when running the test suite, create a Python device specification file in the format: + +import torch +import torch_npu +! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_72.txt b/chunked/content_aware_chunking/_testing/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e0f12e3246e5d0b556558359a30e0991092cdc --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_72.txt @@ -0,0 +1 @@ +! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_73.txt b/chunked/content_aware_chunking/_testing/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..25706ae4c70f3f967301cfd2d6ec561ce5d594b8 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_73.txt @@ -0,0 +1 @@ +Further additional imports can be added here ! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_74.txt b/chunked/content_aware_chunking/_testing/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e0f12e3246e5d0b556558359a30e0991092cdc --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_74.txt @@ -0,0 +1 @@ +! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_75.txt b/chunked/content_aware_chunking/_testing/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..e62d900d38ab30f855851c110dc046cbc3b1e4b6 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_75.txt @@ -0,0 +1 @@ +Specify the device name (eg. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_76.txt b/chunked/content_aware_chunking/_testing/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0a48d269f76d59ee84d3c6a8d589f42215265b3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_76.txt @@ -0,0 +1,3 @@ +'cuda', 'cpu', 'npu') +DEVICE_NAME = 'npu' +Specify device-specific backends to dispatch to. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_77.txt b/chunked/content_aware_chunking/_testing/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae11a95511656b0eec026c5ba544dc298acc39c4 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_77.txt @@ -0,0 +1,6 @@ +If not specified, will fallback to 'default' in 'testing_utils.py` +MANUAL_SEED_FN = torch.npu.manual_seed +EMPTY_CACHE_FN = torch.npu.empty_cache +DEVICE_COUNT_FN = torch.npu.device_count +`` +This format also allows for specification of any additional imports required. 
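Pieced together, the device specification file sketched in the fragments above might look like the following. This is a minimal sketch assuming the torch_npu backend used as the example; the file name spec.py is only a placeholder.

```python
# spec.py -- placeholder name for a device specification file (sketch, assuming torch_npu)
import torch
import torch_npu  # backend-specific package, assumed to be installed

# !! Further additional imports can be added here !!

# Specify the device name (e.g. 'cuda', 'cpu', 'npu')
DEVICE_NAME = "npu"

# Specify device-specific backends to dispatch to.
# If not specified, will fall back to 'default' in 'testing_utils.py'
MANUAL_SEED_FN = torch.npu.manual_seed
EMPTY_CACHE_FN = torch.npu.empty_cache
DEVICE_COUNT_FN = torch.npu.device_count
```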
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_78.txt b/chunked/content_aware_chunking/_testing/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b9b6ddae1a5a040523f0aba63faf21885fd16ed --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_78.txt @@ -0,0 +1 @@ +To use this file to replace equivalent methods in the test suite, set the environment variable `TRANSFORMERS_TEST_DEVICE_SPEC` to the path of the spec file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_79.txt b/chunked/content_aware_chunking/_testing/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ad1b9892f5848f65f9c9d67ebda44a76a8b664d --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_79.txt @@ -0,0 +1 @@ +Currently, only MANUAL_SEED_FN, EMPTY_CACHE_FN and DEVICE_COUNT_FN are supported for device-specific dispatch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_80.txt b/chunked/content_aware_chunking/_testing/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e0fcb8ae4c471111da5db9e1347dcecdfb6aac2 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_80.txt @@ -0,0 +1,2 @@ +Distributed training +pytest can't deal with distributed training directly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_81.txt b/chunked/content_aware_chunking/_testing/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..68da07f45339213e25287f2239b9782a0826b97e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_81.txt @@ -0,0 +1,2 @@ +If this is attempted - the sub-processes don't do the right +thing and end up thinking they are pytest and start running the test suite in loops. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_82.txt b/chunked/content_aware_chunking/_testing/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..a81c8cfc7677ef84fb73661022332a88edf19403 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_82.txt @@ -0,0 +1,2 @@ +It works, however, if one +spawns a normal process that then spawns off multiple workers and manages the IO pipes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_83.txt b/chunked/content_aware_chunking/_testing/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..27f204c66408b06979e72f696066c04a22a1734e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_83.txt @@ -0,0 +1,6 @@ +Here are some tests that use it: + +test_trainer_distributed.py +test_deepspeed.py + +To jump right into the execution point, search for the execute_subprocess_async call in those tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_84.txt b/chunked/content_aware_chunking/_testing/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbbfb7d23661b52e2e1571a73da7c2fe027b3334 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_84.txt @@ -0,0 +1,5 @@ +You will need at least 2 GPUs to see these tests in action: + +CUDA_VISIBLE_DEVICES=0,1 RUN_SLOW=1 pytest -sv tests/test_trainer_distributed.py +Output capture +During test execution any output sent to stdout and stderr is captured.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_85.txt b/chunked/content_aware_chunking/_testing/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d7191605c5ca5746d6753761f8363667fc04337 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_85.txt @@ -0,0 +1,2 @@ +If a test or a setup method fails, its +according captured output will usually be shown along with the failure traceback. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_86.txt b/chunked/content_aware_chunking/_testing/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..8030f9731739b2ddc8a718c383f05c8b3d140109 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_86.txt @@ -0,0 +1,15 @@ +To disable output capturing and to get the stdout and stderr normally, use -s or --capture=no: + +pytest -s tests/utils/test_logging.py +To send test results to JUnit format output: + +py.test tests --junitxml=result.xml +Color control +To have no color (e.g., yellow on white background is not readable): + +pytest --color=no tests/utils/test_logging.py +Sending test report to online pastebin service +Creating a URL for each test failure: + +pytest --pastebin=failed tests/utils/test_logging.py +This will submit test run information to a remote Paste service and provide a URL for each failure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_87.txt b/chunked/content_aware_chunking/_testing/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..176f6db336678417f63ee97470ecc9d0b632be18 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_87.txt @@ -0,0 +1,2 @@ +You may select +tests as usual or add for example -x if you only want to send one particular failure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_88.txt b/chunked/content_aware_chunking/_testing/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..c809918ba120bb05b2985609a1c0df1e1b01b9e3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_88.txt @@ -0,0 +1,6 @@ +Creating a URL for a whole test session log: + +pytest --pastebin=all tests/utils/test_logging.py +Writing tests +🤗 transformers tests are based on unittest, but run by pytest, so most of the time features from both systems +can be used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_89.txt b/chunked/content_aware_chunking/_testing/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb39bcebe85d01e66c379711dfc5599a1cb06aff --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_89.txt @@ -0,0 +1,2 @@ +You can read here which features are supported, but the important +thing to remember is that most pytest fixtures don't work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_90.txt b/chunked/content_aware_chunking/_testing/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e31865775c6ea1f932623acd4259552be88f2a3 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_90.txt @@ -0,0 +1,2 @@ +Neither parametrization, but we use the module +parameterized that works in a similar way. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_91.txt b/chunked/content_aware_chunking/_testing/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bb44b28aed932741a3f3d9bcca2a3683925266e --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_91.txt @@ -0,0 +1,2 @@ +Parametrization +Often, there is a need to run the same test multiple times, but with different arguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_92.txt b/chunked/content_aware_chunking/_testing/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..569c8bd14b655c2cba59cbc54635803b5d292ec8 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_92.txt @@ -0,0 +1,2 @@ +It could be done from within +the test, but then there is no way of running that test for just one set of arguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_93.txt b/chunked/content_aware_chunking/_testing/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbd57abafc2b3420e7c8c2aa896bd55f9a656cc7 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_93.txt @@ -0,0 +1,17 @@ +thon +test_this1.py +import unittest +from parameterized import parameterized +class TestMathUnitTest(unittest.TestCase): + @parameterized.expand( + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ] + ) + def test_floor(self, name, input, expected): + assert_equal(math.floor(input), expected) + +Now, by default this test will be run 3 times, each time with the last 3 arguments of test_floor being assigned the +corresponding arguments in the parameter list. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_94.txt b/chunked/content_aware_chunking/_testing/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e6a5c83c47c9c3063045a43be2cfbcac88c2d74 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_94.txt @@ -0,0 +1,8 @@ +and you could run just the negative and integer sets of params with: + +pytest -k "negative and integer" tests/test_mytest.py +or all but negative sub-tests, with: + +pytest -k "not negative" tests/test_mytest.py +Besides using the -k filter that was just mentioned, you can find out the exact name of each sub-test and run any +or all of them using their exact names. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_95.txt b/chunked/content_aware_chunking/_testing/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e9bacbfd9136789d918cb77d2be415aa68eaf38 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_95.txt @@ -0,0 +1,11 @@ +pytest test_this1.py --collect-only -q +and it will list: + +test_this1.py::TestMathUnitTest::test_floor_0_negative +test_this1.py::TestMathUnitTest::test_floor_1_integer +test_this1.py::TestMathUnitTest::test_floor_2_large_fraction +So now you can run just 2 specific sub-tests: + +pytest test_this1.py::TestMathUnitTest::test_floor_0_negative test_this1.py::TestMathUnitTest::test_floor_1_integer +The module parameterized which is already in the developer dependencies +of transformers works for both: unittests and pytest tests. 
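As written, the snippet above is not self-contained: it never imports math and relies on an assert_equal helper that is not defined. A minimal runnable variant, using plain unittest assertions instead, could look like this:

```python
# test_this1.py -- self-contained variant of the snippet above
import math
import unittest

from parameterized import parameterized


class TestMathUnitTest(unittest.TestCase):
    @parameterized.expand(
        [
            ("negative", -1.5, -2.0),
            ("integer", 1, 1.0),
            ("large fraction", 1.6, 1),
        ]
    )
    def test_floor(self, name, input, expected):
        # unittest's assertEqual stands in for the undefined assert_equal helper
        self.assertEqual(math.floor(input), expected)
```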
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_96.txt b/chunked/content_aware_chunking/_testing/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b1f71f7013415bc94fe4bd6b6ef3f23ae9940de --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_96.txt @@ -0,0 +1,2 @@ +If, however, the test is not a unittest, you may use pytest.mark.parametrize (or you may see it being used in +some existing tests, mostly under examples). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_97.txt b/chunked/content_aware_chunking/_testing/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..46b62ca07b823e2bb4766dd48a32c745d4b5c7e1 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_97.txt @@ -0,0 +1,17 @@ +Here is the same example, this time using pytest's parametrize marker: +thon +test_this2.py +import pytest +@pytest.mark.parametrize( + "name, input, expected", + [ + ("negative", -1.5, -2.0), + ("integer", 1, 1.0), + ("large fraction", 1.6, 1), + ], +) +def test_floor(name, input, expected): + assert_equal(math.floor(input), expected) + +Same as with parameterized, with pytest.mark.parametrize you can have a fine control over which sub-tests are +run, if the -k filter doesn't do the job. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_98.txt b/chunked/content_aware_chunking/_testing/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..35e703ba763a3d8d5ca5d068a6e59bdd4cc64970 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_98.txt @@ -0,0 +1,2 @@ +Except, this parametrization function creates a slightly different set of +names for the sub-tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_testing/chunk_99.txt b/chunked/content_aware_chunking/_testing/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..b16acc0a123b3de48400fdb817f8cc172b5d1e87 --- /dev/null +++ b/chunked/content_aware_chunking/_testing/chunk_99.txt @@ -0,0 +1,12 @@ +Here is what they look like: + +pytest test_this2.py --collect-only -q +and it will list: + +test_this2.py::test_floor[integer-1-1.0] +test_this2.py::test_floor[negative--1.5--2.0] +test_this2.py::test_floor[large fraction-1.6-1] +So now you can run just the specific test: + +pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0] +as in the previous example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_17.txt b/chunked/content_aware_chunking/_tf_xla/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8eb2131fe1856ebb8da602fa56aa1ba7f25f259 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_17.txt @@ -0,0 +1 @@ +However, there are a couple of gotchas in the above code snippet that are specific to XLA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_18.txt b/chunked/content_aware_chunking/_tf_xla/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..df4747b9ae6e950e7bf755c904979341203121c7 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_18.txt @@ -0,0 +1 @@ +You need to be aware of those to realize the speed-ups that XLA can bring in. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_19.txt b/chunked/content_aware_chunking/_tf_xla/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8066daf0e31c5645c3abbfda38990534f8b91ed4 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_19.txt @@ -0,0 +1 @@ +We discuss these in the following section. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_20.txt b/chunked/content_aware_chunking/_tf_xla/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..afc1855548e6e42f5218c7b8d440711a0c52ec9b --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_20.txt @@ -0,0 +1,2 @@ +Gotchas to be aware of +When you are executing an XLA-enabled function (like xla_generate() above) for the first time, it will internally try to infer the computation graph, which is time-consuming. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_21.txt b/chunked/content_aware_chunking/_tf_xla/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ac115eade3a23d4b2dd01fa8a2042020e245799 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_21.txt @@ -0,0 +1 @@ +This process is known as "tracing". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_22.txt b/chunked/content_aware_chunking/_tf_xla/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b3fa71358f81863bb95d0fde64f806456e6e78f --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_22.txt @@ -0,0 +1 @@ +You might notice that the generation time is not fast. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_23.txt b/chunked/content_aware_chunking/_tf_xla/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..f27b80ed422ffa087fb48b852b318bf22546972b --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_23.txt @@ -0,0 +1 @@ +Successive calls of xla_generate() (or any other XLA-enabled function) won't have to infer the computation graph, given the inputs to the function follow the same shape with which the computation graph was initially built. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_24.txt b/chunked/content_aware_chunking/_tf_xla/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3668a096207b4e3a89dbcb02392defd40770edbf --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_24.txt @@ -0,0 +1 @@ +While this is not a problem for modalities with fixed input shapes (e.g., images), you must pay attention if you are working with variable input shape modalities (e.g., text). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_25.txt b/chunked/content_aware_chunking/_tf_xla/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a3b997e068cc50554c1c142bea314960f8a7c12 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_25.txt @@ -0,0 +1 @@ +To ensure xla_generate() always operates with the same input shapes, you can specify the padding arguments when calling the tokenizer.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_26.txt b/chunked/content_aware_chunking/_tf_xla/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..37d9ab019a9cb6dcd2906f59a1894249fdd02d9b --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_26.txt @@ -0,0 +1,7 @@ +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2") +input_string = ["TensorFlow is"] +xla_generate = tf.function(model.generate, jit_compile=True) +Here, we call the tokenizer with padding options. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_27.txt b/chunked/content_aware_chunking/_tf_xla/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4113c9395ece617313d3de43095cb9107636367 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_27.txt @@ -0,0 +1,6 @@ +tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") +generated_tokens = xla_generate(**tokenized_input, num_beams=2) +decoded_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True) +print(f"Generated -- {decoded_text}") + +This way, you can ensure that the inputs to xla_generate() will always receive inputs with the shape it was traced with and thus leading to speed-ups in the generation time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_28.txt b/chunked/content_aware_chunking/_tf_xla/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d565dbe87f364fab8065a5efa4cca651061e3f58 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_28.txt @@ -0,0 +1,22 @@ +You can verify this with the code below: + +import time +import tensorflow as tf +from transformers import AutoTokenizer, TFAutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left", pad_token="") +model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2") +xla_generate = tf.function(model.generate, jit_compile=True) +for input_string in ["TensorFlow is", "TensorFlow is a", "TFLite is a"]: + tokenized_input = tokenizer(input_string, pad_to_multiple_of=8, padding=True, return_tensors="tf") + start = time.time_ns() + generated_tokens = xla_generate(**tokenized_input, num_beams=2) + end = time.time_ns() + print(f"Execution time -- {(end - start) / 1e6:.1f} ms\n") + +On a Tesla T4 GPU, you can expect the outputs like so: +```bash +Execution time -- 30819.6 ms +Execution time -- 79.0 ms +Execution time -- 78.9 ms +`` +The first call toxla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_29.txt b/chunked/content_aware_chunking/_tf_xla/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..849c49808719c28e3d76c628bb081d895288f69d --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_29.txt @@ -0,0 +1 @@ +Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time. 
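To see that re-tracing cost concretely, the sketch below times the same XLA-compiled call with a changed generation option. It reuses the pattern from the snippet above, but setting the pad token from the EOS token (rather than hard-coding one) is my own choice, and the exact timings will vary with hardware.

```python
import time

import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 ships without a pad token; reuse EOS for padding
model = TFAutoModelForCausalLM.from_pretrained("openai-community/gpt2")
xla_generate = tf.function(model.generate, jit_compile=True)

def timed_call(**kwargs):
    start = time.time_ns()
    xla_generate(**kwargs)
    return (time.time_ns() - start) / 1e6

inputs = tokenizer(["TensorFlow is"], pad_to_multiple_of=8, padding=True, return_tensors="tf")
print(f"num_beams=2, 1st call: {timed_call(**inputs, num_beams=2):.1f} ms")  # traces the graph
print(f"num_beams=2, 2nd call: {timed_call(**inputs, num_beams=2):.1f} ms")  # reuses the traced graph
print(f"num_beams=3, 1st call: {timed_call(**inputs, num_beams=3):.1f} ms")  # changed option -> re-traces
```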
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_30.txt b/chunked/content_aware_chunking/_tf_xla/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..9adf0829fbee907215d738653d79cf4c1aad0494 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_30.txt @@ -0,0 +1 @@ +We didn’t cover all the text generation options 🤗 Transformers provides in this document. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_31.txt b/chunked/content_aware_chunking/_tf_xla/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2c400d18118cb408becf69bf249a370dcdb61f --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_31.txt @@ -0,0 +1 @@ +We encourage you to read the documentation for advanced use cases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_32.txt b/chunked/content_aware_chunking/_tf_xla/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..81dde36f1a4bf30a667333bfa7a167b2b86479cd --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_32.txt @@ -0,0 +1,2 @@ +Additional Resources +Here, we leave you with some additional resources if you want to delve deeper into XLA in 🤗 Transformers and in general. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_33.txt b/chunked/content_aware_chunking/_tf_xla/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9c299372c0f120f00a30df697120337750e16bb --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_33.txt @@ -0,0 +1 @@ +This Colab Notebook provides an interactive demonstration if you want to fiddle with the XLA-compatible encoder-decoder (like T5) and decoder-only (like GPT2) text generation models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_34.txt b/chunked/content_aware_chunking/_tf_xla/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbb751adae2e2facd68cdd9219cf13500bfffc4e --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_34.txt @@ -0,0 +1 @@ +This blog post provides an overview of the comparison benchmarks for XLA-compatible models along with a friendly introduction to XLA in TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_35.txt b/chunked/content_aware_chunking/_tf_xla/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f8e2a39f7236b0ea908a9fa2c018cd3ffc9367 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_35.txt @@ -0,0 +1 @@ +This blog post discusses our design philosophy behind adding XLA support to the TensorFlow models in 🤗 Transformers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tf_xla/chunk_36.txt b/chunked/content_aware_chunking/_tf_xla/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..d492476d74e5b7381eab2381909d0e1bd2d67de2 --- /dev/null +++ b/chunked/content_aware_chunking/_tf_xla/chunk_36.txt @@ -0,0 +1,4 @@ +Recommended posts for learning more about XLA and TensorFlow graphs in general: +XLA: Optimizing Compiler for Machine Learning +Introduction to graphs and tf.function +Better performance with tf.function \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tflite/chunk_4.txt b/chunked/content_aware_chunking/_tflite/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..118a3a17c6ace7f4e47a4e65e14985fb55c68ba4 --- /dev/null +++ b/chunked/content_aware_chunking/_tflite/chunk_4.txt @@ -0,0 +1 @@ +For the list of supported model architectures, please refer to 🤗 Optimum documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tflite/chunk_5.txt b/chunked/content_aware_chunking/_tflite/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b144837d04a90e9294f79ba58351b8839a703b96 --- /dev/null +++ b/chunked/content_aware_chunking/_tflite/chunk_5.txt @@ -0,0 +1,19 @@ +To export a model to TFLite, install the required dependencies: + +pip install optimum[exporters-tf] +To check out all available arguments, refer to the 🤗 Optimum docs, +or view help in command line: + +optimum-cli export tflite --help +To export a model's checkpoint from the 🤗 Hub, for example, google-bert/bert-base-uncased, run the following command: + +optimum-cli export tflite --model google-bert/bert-base-uncased --sequence_length 128 bert_tflite/ +You should see the logs indicating progress and showing where the resulting model.tflite is saved, like this: + +Validating TFLite model + -[✓] TFLite model output names match reference model (logits) + - Validating TFLite Model output "logits": + -[✓] (1, 128, 30522) matches (1, 128, 30522) + -[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05) +The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05: +- logits: max diff = 5.817413330078125e-05. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tflite/chunk_6.txt b/chunked/content_aware_chunking/_tflite/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6541b94e9496f8025205c76a840fdbff5b5ebe1a --- /dev/null +++ b/chunked/content_aware_chunking/_tflite/chunk_6.txt @@ -0,0 +1,2 @@ +The exported model was saved at: bert_tflite +The example above illustrates exporting a checkpoint from 🤗 Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tflite/chunk_7.txt b/chunked/content_aware_chunking/_tflite/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ce17a0bc63785d9eca593486bc3244e2f4ac392 --- /dev/null +++ b/chunked/content_aware_chunking/_tflite/chunk_7.txt @@ -0,0 +1,2 @@ +When exporting a local model, first make sure that you +saved both the model's weights and tokenizer files in the same directory (local_path). 
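For the local case, a minimal sketch of preparing such a directory is shown below. The directory name local_bert is a placeholder, and saving the TensorFlow weights is an assumption on my part; the exporter may also be able to start from other checkpoint formats.

```python
# Sketch: save a checkpoint and its tokenizer into one local directory before exporting.
from transformers import AutoTokenizer, TFAutoModel

local_path = "local_bert"  # placeholder directory name
model = TFAutoModel.from_pretrained("google-bert/bert-base-uncased")
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model.save_pretrained(local_path)
tokenizer.save_pretrained(local_path)

# Afterwards, point the CLI at the directory instead of a Hub checkpoint name:
#   optimum-cli export tflite --model local_bert --sequence_length 128 bert_tflite/
```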
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tflite/chunk_8.txt b/chunked/content_aware_chunking/_tflite/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..adcbdce05c74f8d8894a4fe017436951bba4bcb5 --- /dev/null +++ b/chunked/content_aware_chunking/_tflite/chunk_8.txt @@ -0,0 +1,2 @@ +When using CLI, pass the +local_path to the model argument instead of the checkpoint name on 🤗 Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_100.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..06e9e734b9f9cd2b47d827c9365fa1d72df17380 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_100.txt @@ -0,0 +1,3 @@ +"u", followed by "g" would have only been +merged if the probability of "ug" divided by "u", "g" would have been greater than for any other symbol +pair. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_101.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..40a8ab7cbf7cf2c9872a2179d463ad1739662ee0 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_101.txt @@ -0,0 +1,2 @@ +Intuitively, WordPiece is slightly different to BPE in that it evaluates what it loses by merging two symbols +to ensure it's worth it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_102.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b67ba06dc4c388d76f25e96054053b8d752f18c --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_102.txt @@ -0,0 +1,3 @@ +Unigram +Unigram is a subword tokenization algorithm introduced in Subword Regularization: Improving Neural Network Translation +Models with Multiple Subword Candidates (Kudo, 2018). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_103.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..df19c4b81dfba65b9e56fa75bd4b217f55e61bff --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_103.txt @@ -0,0 +1,3 @@ +In contrast to BPE or +WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively trims down each +symbol to obtain a smaller vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_104.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7977ca728aab519bcbbc56b861f6ebbaab8d44d --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_104.txt @@ -0,0 +1,2 @@ +The base vocabulary could for instance correspond to all pre-tokenized words and +the most common substrings. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_105.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..427c113b72fe58e095fb9492a4a2133c81a98528 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_105.txt @@ -0,0 +1,2 @@ +Unigram is not used directly for any of the models in the transformers, but it's used in +conjunction with SentencePiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_106.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c755cf57b7f7bc2c7f9e663147f2e579d9ab506 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_106.txt @@ -0,0 +1,2 @@ +At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training +data given the current vocabulary and a unigram language model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_107.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ef60241bc2dbfed249bbe188eac3c1900f1e06b --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_107.txt @@ -0,0 +1,2 @@ +Then, for each symbol in the vocabulary, the algorithm +computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_108.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2e774c1382476784a9ddd96c39e9b04e5bcec7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_108.txt @@ -0,0 +1,2 @@ +Unigram then +removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_109.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e885c9e42189f61d26a57ad730f601bc3441f45 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_109.txt @@ -0,0 +1,2 @@ +those +symbols that least affect the overall loss over the training data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_110.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..f629eb3155a9ea36b02b08972b99a37432d1a1f5 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_110.txt @@ -0,0 +1,2 @@ +This process is repeated until the vocabulary has +reached the desired size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_111.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ca297b094a76a73d8630e99acf0c20f33dbdfa4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_111.txt @@ -0,0 +1 @@ +The Unigram algorithm always keeps the base characters so that any word can be tokenized. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_112.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ab57d4f0d91c2b21316a3de7fabf4604bbe3bd4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_112.txt @@ -0,0 +1,2 @@ +Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of +tokenizing new text after training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_113.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..6930dde7be6c562ca601254a74555a6b709bcfbf --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_113.txt @@ -0,0 +1,3 @@ +As an example, if a trained Unigram tokenizer exhibits the vocabulary: +["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"], +"hugs" could be tokenized both as ["hug", "s"], ["h", "ug", "s"] or ["h", "u", "g", "s"]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_114.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..f78fc5eb5f7a605fca95febb8447af86de2eeb78 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_114.txt @@ -0,0 +1,2 @@ +So which one +to choose? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_115.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..b46af07ee9b69e897ea53a2e2286f3a43fcf8a36 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_115.txt @@ -0,0 +1,2 @@ +Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that +the probability of each possible tokenization can be computed after training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_116.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eb7282174887990671f8ee4c915866fe314130d --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_116.txt @@ -0,0 +1,3 @@ +The algorithm simply picks the most +likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their +probabilities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_117.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dda1e90f45f459540d8f521dafadface87b9ad4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_117.txt @@ -0,0 +1 @@ +Those probabilities are defined by the loss the tokenizer is trained on. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_118.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f7a18be02d74805113aed8d0adb701d5d029ec4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_118.txt @@ -0,0 +1,8 @@ +Assuming that the training data consists of +the words \(x_{1}, \dots, x_{N}\) and that the set of all possible tokenizations for a word \(x_{i}\) is +defined as \(S(x_{i})\), then the overall loss is defined as +$$\mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )$$ + +SentencePiece +All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to +separate words. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_119.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e0ed80cc708683863515cf372cc1e3996a78c94 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_119.txt @@ -0,0 +1 @@ +However, not all languages use spaces to separate words. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_120.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b481b1100a789001555adc93b1b8ef35e060d03 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_120.txt @@ -0,0 +1,2 @@ +One possible solution is to use language +specific pre-tokenizers, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_121.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..8688cd6010c2e92004af623d5245366e3dd27106 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_121.txt @@ -0,0 +1 @@ +XLM uses a specific Chinese, Japanese, and Thai pre-tokenizer). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_122.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..eaa3dc9553c8f42585a7ac0802871648e6a40624 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_122.txt @@ -0,0 +1,3 @@ +To solve this problem more generally, SentencePiece: A simple and language independent subword tokenizer and +detokenizer for Neural Text Processing (Kudo et al., 2018) treats the input +as a raw input stream, thus including the space in the set of characters to use. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_123.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..547e17ff21f4c56a15398aaf30090b03587daa19 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_123.txt @@ -0,0 +1,2 @@ +It then uses the BPE or unigram +algorithm to construct the appropriate vocabulary. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_124.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..54cf1ad10d109154c9c2566b6658d8469e59321a --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_124.txt @@ -0,0 +1,2 @@ +The [XLNetTokenizer] uses SentencePiece for example, which is also why in the example earlier the +"▁" character was included in the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_125.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d47f5ea9668835c9a578d83bf992f15f8d0b966 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_125.txt @@ -0,0 +1,2 @@ +Decoding with SentencePiece is very easy since all tokens can just be +concatenated and "▁" is replaced by a space. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_126.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf65b16b6bd3accb3df2c606cd242e4e0ff3bfc8 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_126.txt @@ -0,0 +1 @@ +All transformers models in the library that use SentencePiece use it in combination with unigram. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_127.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..72d3dda96ab6d549e5e40c23b2451abf5c335333 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_127.txt @@ -0,0 +1,2 @@ +Examples of models +using SentencePiece are ALBERT, XLNet, Marian, and T5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_39.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2cc44db736e51a9c1014bbd57c772b51bc2aad4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_39.txt @@ -0,0 +1,3 @@ +So to get the best of +both worlds, transformers models use a hybrid between word-level and character-level tokenization called subword +tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_40.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb5bc00fd0acc1f5d8079ecc7906a639c2a8a0cf --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_40.txt @@ -0,0 +1,4 @@ +Subword tokenization + +Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller +subwords, but rare words should be decomposed into meaningful subwords.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_41.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6f69d0d45a967a04fcf8a187a37303ddeb84ea7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_41.txt @@ -0,0 +1,2 @@ +For instance "annoyingly" might be +considered a rare word and could be decomposed into "annoying" and "ly". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_42.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..f316d48887317030fce74415e084b3a92a0e0fd7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_42.txt @@ -0,0 +1,3 @@ +Both "annoying" and "ly" as +stand-alone subwords would appear more frequently while at the same time the meaning of "annoyingly" is kept by the +composite meaning of "annoying" and "ly". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_43.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..8809788ca6ba67be9ea5b1536fe377311bac01d7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_43.txt @@ -0,0 +1,2 @@ +This is especially useful in agglutinative languages such as Turkish, +where you can form (almost) arbitrarily long complex words by stringing together subwords. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_44.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..277f6c40505850224b59977148fdbcfe209b0722 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_44.txt @@ -0,0 +1,2 @@ +Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful +context-independent representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_45.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..37dd35801b6acf3e3c93bab172ce4f3b34ce5d78 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_45.txt @@ -0,0 +1,2 @@ +In addition, subword tokenization enables the model to process words it has never +seen before, by decomposing them into known subwords. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_46.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a27f65bcb6ff800c98aea0dc71b00446137b862a --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_46.txt @@ -0,0 +1,2 @@ +For instance, the [~transformers.BertTokenizer] tokenizes +"I have a new GPU!" 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_47.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ce2633ca8d189fd5eef2a56b65165b23620dfcd --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_47.txt @@ -0,0 +1,5 @@ +as follows: + +from transformers import BertTokenizer +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +tokenizer.tokenize("I have a new GPU!") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_48.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..156ad0327159581437fdf2c29bc77e6632dcce38 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_48.txt @@ -0,0 +1 @@ +["i", "have", "a", "new", "gp", "##u", "!"] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_49.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7fc34910dd66cd5cbdb427bc24ffb2091dc3c29 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_49.txt @@ -0,0 +1 @@ +Because we are considering the uncased model, the sentence was lowercased first. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_50.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2db92fe9b4fd454b849f8653770230eff2efce0 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_50.txt @@ -0,0 +1 @@ +We can see that the words ["i", "have", "a", "new"] are present in the tokenizer's vocabulary, but the word "gpu" is not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_51.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..acac6cb3a948d6fb7de303d6094aa1ca52db6f79 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_51.txt @@ -0,0 +1,2 @@ +Consequently, the +tokenizer splits "gpu" into known subwords: ["gp" and "##u"]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_52.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a673828139e10e16fe5f71808384810619ea8c3 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_52.txt @@ -0,0 +1,2 @@ +"##" means that the rest of the token should +be attached to the previous one, without space (for decoding or reversal of the tokenization). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_53.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9c08fab7bd0355c921ca547722bbc2d26b9ac5c --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_53.txt @@ -0,0 +1,5 @@ +As another example, [~transformers.XLNetTokenizer] tokenizes our previously exemplary text as follows: + +from transformers import XLNetTokenizer +tokenizer = XLNetTokenizer.from_pretrained("xlnet/xlnet-base-cased") +tokenizer.tokenize("Don't you love 🤗 Transformers? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_54.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..41b379816a3ac8cea778003b0c48e3c87d2f801d --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_54.txt @@ -0,0 +1 @@ +We sure do.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_55.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..2986a1b6b5d02b76a86bf9fa63aad838a54fa8cf --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_55.txt @@ -0,0 +1 @@ +["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_56.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1d68199b0f53e9a107457cb3a9619feb9f10a60 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_56.txt @@ -0,0 +1 @@ +", "▁We", "▁sure", "▁do", "."] \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_57.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc7a29ee44e1cccad8ffdf5bde8e2ee6ebba2bc1 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_57.txt @@ -0,0 +1 @@ +We'll get back to the meaning of those "▁" when we look at SentencePiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_58.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1537ce230dab6d1a5c796337dacaaba110999c8 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_58.txt @@ -0,0 +1,2 @@ +As one can see, +the rare word "Transformers" has been split into the more frequent subwords "Transform" and "ers". \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_59.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..1452aee25905d6868c0104b86be98d603d032a15 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_59.txt @@ -0,0 +1 @@ +Let's now look at how the different subword tokenization algorithms work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_60.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5377721fad5717cb5be9dafcc18bab53f68e949 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_60.txt @@ -0,0 +1,3 @@ +Note that all of those tokenization +algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained +on.
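As a purely illustrative aside (not something this summary prescribes), the separate tokenizers library that backs the fast tokenizers in transformers can train such a subword vocabulary on a toy corpus in a few lines; the corpus, vocabulary size, and special tokens below are arbitrary choices.

```python
# Illustrative sketch: training a tiny BPE vocabulary on a toy corpus with the `tokenizers` library.
from tokenizers import Tokenizer, models, pre_tokenizers, trainers

# Toy corpus mirroring the word frequencies used in the BPE walkthrough below
corpus = ["hug " * 10, "pug " * 5, "pun " * 12, "bun " * 4, "hugs " * 5]

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()
trainer = trainers.BpeTrainer(vocab_size=20, special_tokens=["[UNK]"])
tokenizer.train_from_iterator(corpus, trainer=trainer)

print(tokenizer.encode("hugs").tokens)  # subword tokens learned from the toy corpus
```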
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_61.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f7394616c5c7b481872bd0904bd606b827f0598 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_61.txt @@ -0,0 +1,3 @@ +Byte-Pair Encoding (BPE) +Byte-Pair Encoding (BPE) was introduced in Neural Machine Translation of Rare Words with Subword Units (Sennrich et +al., 2015). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_62.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..9364a4184a9cf0e095bd6a6525a03dbc5c72d0d9 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_62.txt @@ -0,0 +1,2 @@ +BPE relies on a pre-tokenizer that splits the training data into +words. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_63.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8919212407fd5585ea6f2e56a18560df5c6ffdb --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_63.txt @@ -0,0 +1 @@ +Pretokenization can be as simple as space tokenization, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_64.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..7709eb81ccda93b026f55724543e3c81eda9eb91 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_64.txt @@ -0,0 +1 @@ +GPT-2, RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_65.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbe2a58b94a8697ce3f487004d33f52856faf453 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_65.txt @@ -0,0 +1 @@ +More advanced pre-tokenization include rule-based tokenization, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_66.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f35ec167f393c6724fa5caf9004ad59e05c7eec --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_66.txt @@ -0,0 +1,3 @@ +XLM, +FlauBERT which uses Moses for most languages, or GPT which uses +spaCy and ftfy, to count the frequency of each word in the training corpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_67.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..4448bc74bb4d52a6ec7c05e459dfb479a18053c9 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_67.txt @@ -0,0 +1,2 @@ +After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the +training data has been determined. 
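To make the pre-tokenization step concrete, here is a minimal sketch of building such a word-frequency table with plain whitespace splitting (the toy corpus and the use of collections.Counter are illustrative assumptions, not the pre-tokenizer of any particular model):
python
from collections import Counter

# Toy training corpus (assumed for illustration)
corpus = ["hug hug pug pun bun hugs", "pun pun hug"]

word_freqs = Counter()
for line in corpus:
    # whitespace pre-tokenization: split raw text into words, then count them
    word_freqs.update(line.split())

print(word_freqs)  # Counter({'hug': 3, 'pun': 3, 'pug': 1, 'bun': 1, 'hugs': 1})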
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_68.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b8af9262cf1e50e5800035b5e1988070ff90d8a --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_68.txt @@ -0,0 +1,2 @@ +Next, BPE creates a base vocabulary consisting of all symbols that occur in the set +of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_69.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1493d231675f130f8ec0383b5464909788018ac --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_69.txt @@ -0,0 +1,2 @@ +It does so until +the vocabulary has attained the desired vocabulary size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_70.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0ce1b681748204ee23a45803848bbd3d41c45b5 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_70.txt @@ -0,0 +1,2 @@ +Note that the desired vocabulary size is a hyperparameter to +define before training the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_71.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ead24359653fa9dc2dbafb810eeaaab35b1ed34 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_71.txt @@ -0,0 +1,4 @@ +As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been +determined: +("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5) +Consequently, the base vocabulary is ["b", "g", "h", "n", "p", "s", "u"]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_72.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..73b9d87475c2fbc2da570619190840f1fef494ac --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_72.txt @@ -0,0 +1,4 @@ +Splitting all words into symbols of the +base vocabulary, we obtain: +("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5) +BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_73.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..9701232ae0dc01a16f2d71046d2e29954539b276 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_73.txt @@ -0,0 +1,3 @@ +In +the example above "h" followed by "u" is present 10 + 5 = 15 times (10 times in the 10 occurrences of +"hug", 5 times in the 5 occurrences of "hugs"). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_74.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..2400778df845d5472dcb36d07f461e912daaae3e --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_74.txt @@ -0,0 +1,2 @@ +However, the most frequent symbol pair is "u" followed by +"g", occurring 10 + 5 + 5 = 20 times in total. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_75.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..975439a9ba0e3adc32c85e7781cfabaf6e0f9649 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_75.txt @@ -0,0 +1,2 @@ +Thus, the first merge rule the tokenizer learns is to group all +"u" symbols followed by a "g" symbol together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_76.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..45f106c5abae90579063a0a31b4cd93c94781dca --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_76.txt @@ -0,0 +1 @@ +Next, "ug" is added to the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_77.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e2131e7aba5c55b80c0767a88c5bd9e99e48d12 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_77.txt @@ -0,0 +1,4 @@ +The set of words then +becomes +("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5) +BPE then identifies the next most common symbol pair. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_78.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..43dc32004e0bc77f1103663796bdcc723db60505 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_78.txt @@ -0,0 +1 @@ +It's "u" followed by "n", which occurs 16 times. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_79.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..71cd63083f696c67353dab3ecc7180e26c90766d --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_79.txt @@ -0,0 +1,2 @@ +"u", +"n" is merged to "un" and added to the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_80.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..b03a5da1506639d02af4f1482eaffaf15ee33361 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_80.txt @@ -0,0 +1,2 @@ +The next most frequent symbol pair is "h" followed by +"ug", occurring 15 times. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_81.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3c45f2498c3c71592a97e99563842885f9f941d --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_81.txt @@ -0,0 +1 @@ +Again the pair is merged and "hug" can be added to the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_82.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7cb6e1bfbec9660973637a347207b081ae34926 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_82.txt @@ -0,0 +1,5 @@ +At this stage, the vocabulary is ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"] and our set of unique words +is represented as +("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5) +Assuming that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied +to new words (as long as those new words do not include symbols that were not in the base vocabulary). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_83.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..47df9ceb8b59c6dfac9ef75de5caf8ada6ff300b --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_83.txt @@ -0,0 +1,3 @@ +For instance, +the word "bug" would be tokenized to ["b", "ug"] but "mug" would be tokenized as ["<unk>", "ug"] since +the symbol "m" is not in the base vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_84.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..184ae47353fbd7ce5a1627aaf37c80b0bdf110db --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_84.txt @@ -0,0 +1,3 @@ +In general, single letters such as "m" are not replaced by the +"<unk>" symbol because the training data usually includes at least one occurrence of each letter, but it is likely +to happen for very special characters like emojis. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_85.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..550158158c012dc16e68d7ba2015cd2622f2aee4 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_85.txt @@ -0,0 +1 @@ +As mentioned earlier, the vocabulary size, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_86.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d6733c8e7e8d45eba73cb2c4c8ccb8f4e250e41 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_86.txt @@ -0,0 +1,2 @@ +the base vocabulary size + the number of merges, is a hyperparameter +to choose.
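The merge procedure described above can be sketched in a few lines of Python. This is an illustrative toy implementation of the counting and merging loop on the example frequencies, not the implementation used by any of the tokenizers discussed here:
python
# Toy word frequencies from the example above
word_freqs = {"hug": 10, "pug": 5, "pun": 12, "bun": 4, "hugs": 5}
splits = {word: list(word) for word in word_freqs}  # start from single characters

def most_frequent_pair():
    pair_counts = {}
    for word, freq in word_freqs.items():
        symbols = splits[word]
        for pair in zip(symbols, symbols[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + freq
    return max(pair_counts, key=pair_counts.get)

merges = []
for _ in range(3):  # learn the three merges from the example: "ug", "un", "hug"
    first, second = most_frequent_pair()
    merges.append((first, second))
    for word, symbols in splits.items():
        i = 0
        while i < len(symbols) - 1:
            if symbols[i] == first and symbols[i + 1] == second:
                symbols[i : i + 2] = [first + second]
            else:
                i += 1

print(merges)          # [('u', 'g'), ('u', 'n'), ('h', 'ug')]
print(splits["hugs"])  # ['hug', 's']
Applying the learned merges in order to a new word such as "bug" yields ["b", "ug"], while "mug" falls back to the unknown symbol for "m", matching the behaviour described above.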
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_87.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..42848dbcc72b8f7be49ed194d43a5b9bf9d952ab --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_87.txt @@ -0,0 +1,2 @@ +For instance GPT has a vocabulary size of 40,478 since they have 478 base characters +and chose to stop training after 40,000 merges. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_88.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..19afd8043868bcd7321b01e26cccdd88d027ad07 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_88.txt @@ -0,0 +1,2 @@ +Byte-level BPE +A base vocabulary that includes all possible base characters can be quite large if e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_89.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..28a7d5e4704a23009b968689a978212bad9b9d7f --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_89.txt @@ -0,0 +1,2 @@ +all unicode characters are +considered as base characters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_90.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..56d14cf1c42801d568082c8e9b94d70330548754 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_90.txt @@ -0,0 +1,3 @@ +To have a better base vocabulary, GPT-2 uses bytes +as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that +every base character is included in the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_91.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7aea06dc6f9a940e6c9e9cc51b0b2c4e93375db --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_91.txt @@ -0,0 +1,2 @@ +With some additional rules to deal with punctuation, the GPT2's +tokenizer can tokenize every text without the need for the symbol. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_92.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..47fc1d74006f7411ff02055851beef7afab324e7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_92.txt @@ -0,0 +1,3 @@ +GPT-2 has a vocabulary +size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols learned +with 50,000 merges. 
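If a GPT-2 checkpoint is available (the gpt2 model on the Hugging Face Hub is assumed here), the vocabulary size can be checked directly:
python
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
print(len(tokenizer))  # 50257: 256 byte-level base tokens + 50,000 merges + 1 end-of-text token
# Because the base vocabulary covers every byte, arbitrary text tokenizes without an unknown token
print(tokenizer.tokenize("I have a new GPU!"))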
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_93.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b0667b8ca11424b49135c22b8ca91e501d854fd --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_93.txt @@ -0,0 +1,2 @@ +WordPiece +WordPiece is the subword tokenization algorithm used for BERT, DistilBERT, and Electra. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_94.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..8048883ccf14d364d4dbb3b947af9522e1690fd7 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_94.txt @@ -0,0 +1,3 @@ +The algorithm was outlined in Japanese and Korean +Voice Search (Schuster et al., 2012) and is very similar to +BPE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_95.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5bba2b0a01b734d5e4c12ab4755f1d71b7f42b9 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_95.txt @@ -0,0 +1,2 @@ +WordPiece first initializes the vocabulary to include every character present in the training data and +progressively learns a given number of merge rules. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_96.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..319c60ae60a5f78127e9323944c0c92185a5381c --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_96.txt @@ -0,0 +1,2 @@ +In contrast to BPE, WordPiece does not choose the most frequent +symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_97.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b57c77a2c1b79cea19156c2cc6b0d3200ad61a2 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_97.txt @@ -0,0 +1 @@ +So what does this mean exactly? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_98.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..814b9a39ac104b10a2b6c2e4ff0ac7fe593040d8 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_98.txt @@ -0,0 +1,3 @@ +Referring to the previous example, maximizing the likelihood of the training data is +equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by +its second symbol is the greatest among all symbol pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_tokenizer_summary/chunk_99.txt b/chunked/content_aware_chunking/_tokenizer_summary/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/content_aware_chunking/_tokenizer_summary/chunk_99.txt @@ -0,0 +1 @@ +E.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_10.txt b/chunked/content_aware_chunking/_torchscript/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6bb854a617d5351a690fe089e40bc50bb0b68eb --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_10.txt @@ -0,0 +1 @@ +Training would desynchronize the two layers, leading to unexpected results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_11.txt b/chunked/content_aware_chunking/_torchscript/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..23dea4fb1682ce4245cca2b43b4a8cd8bfdf9442 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_11.txt @@ -0,0 +1,2 @@ +This is not the case for models that do not have a language model head, as those do not +have tied weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_12.txt b/chunked/content_aware_chunking/_torchscript/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d8727646048edebd622c5b42fd8bd9a0aa972a4 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_12.txt @@ -0,0 +1 @@ +These models can be safely exported without the torchscript flag. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_13.txt b/chunked/content_aware_chunking/_torchscript/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a549fcfb53f297125267b5b639d24f238eb10ac3 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_13.txt @@ -0,0 +1,2 @@ +Dummy inputs and standard lengths +The dummy inputs are used for a models forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_14.txt b/chunked/content_aware_chunking/_torchscript/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf6d4f2a1a37f6cd30db6a86a8a785cdd11f8c84 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_14.txt @@ -0,0 +1,3 @@ +While the inputs' values are +propagated through the layers, PyTorch keeps track of the different operations executed +on each tensor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_15.txt b/chunked/content_aware_chunking/_torchscript/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..30a6e57dd45bf2be46d4f35130ed1cb07069118a --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_15.txt @@ -0,0 +1,2 @@ +These recorded operations are then used to create the trace of the +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_16.txt b/chunked/content_aware_chunking/_torchscript/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfc84424759bf88163e6a60aa1be586296bd46f6 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_16.txt @@ -0,0 +1 @@ +The trace is created relative to the inputs' dimensions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_17.txt b/chunked/content_aware_chunking/_torchscript/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5d4e46610081729a93488d5edc3f89008456929 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_17.txt @@ -0,0 +1,3 @@ +It is therefore constrained by +the dimensions of the dummy input, and will not work for any other sequence length or +batch size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_18.txt b/chunked/content_aware_chunking/_torchscript/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..917161159a863fd22c78bc557743f6c4f34fa632 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_18.txt @@ -0,0 +1,4 @@ +When trying with a different size, the following error is raised: +`The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2` +We recommended you trace the model with a dummy input size at least as large as the +largest input that will be fed to the model during inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_19.txt b/chunked/content_aware_chunking/_torchscript/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd4530a81f32c4ddd2629f515d4744be5ae3c0d --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_19.txt @@ -0,0 +1,2 @@ +Padding can help fill the +missing values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_20.txt b/chunked/content_aware_chunking/_torchscript/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..103de39dcc264f669fd157cfb229a71a2587fc41 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_20.txt @@ -0,0 +1,2 @@ +However, since the model is traced with a larger input size, the +dimensions of the matrix will also be large, resulting in more calculations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_21.txt b/chunked/content_aware_chunking/_torchscript/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf13739d36f46d4bcfab61b3defb282984bc58e1 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_21.txt @@ -0,0 +1,2 @@ +Be careful of the total number of operations done on each input and follow the +performance closely when exporting varying sequence-length models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_22.txt b/chunked/content_aware_chunking/_torchscript/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d05ed3c70b7eada3b8d32aa5fd3ce9a9a2d82ead --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_22.txt @@ -0,0 +1,3 @@ +Using TorchScript in Python +This section demonstrates how to save and load models as well as how to use the trace +for inference. 
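One way to satisfy this constraint in practice is to pad (and truncate) every inference input to the same fixed length used for the dummy input. A minimal sketch, where the checkpoint and the length of 128 are assumptions for illustration:
python
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
# Pad/truncate to the sequence length the model was traced with (assumed to be 128 here)
encoded = tokenizer(
    "A much shorter input sentence",
    padding="max_length",
    truncation=True,
    max_length=128,
    return_tensors="pt",
)
print(encoded["input_ids"].shape)  # torch.Size([1, 128]), matching the traced dimensions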
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_23.txt b/chunked/content_aware_chunking/_torchscript/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..cea8ed3bff6f962d46c6042f9a20d970cb9f7f78 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_23.txt @@ -0,0 +1,9 @@ +Saving a model +To export a BertModel with TorchScript, instantiate BertModel from the BertConfig +class and then save it to disk under the filename traced_bert.pt: +python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +enc = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +# Tokenizing input text +text = "[CLS] Who was Jim Henson ? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_24.txt b/chunked/content_aware_chunking/_torchscript/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ede53251e299366a51326f4ae2a40ebe563555d7 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_24.txt @@ -0,0 +1,13 @@ +[SEP] Jim Henson was a puppeteer [SEP]" +tokenized_text = enc.tokenize(text) +# Masking one of the input tokens +masked_index = 8 +tokenized_text[masked_index] = "[MASK]" +indexed_tokens = enc.convert_tokens_to_ids(tokenized_text) +segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1] +# Creating a dummy input +tokens_tensor = torch.tensor([indexed_tokens]) +segments_tensors = torch.tensor([segments_ids]) +dummy_input = [tokens_tensor, segments_tensors] +# Initializing the model with the torchscript flag +# Flag set to True even though it is not necessary as this model does not have an LM Head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_25.txt b/chunked/content_aware_chunking/_torchscript/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b428a0d07d91a8d0510ed37436d1669bf87dd19a --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_25.txt @@ -0,0 +1,33 @@ +config = BertConfig( + vocab_size_or_config_json_file=32000, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + torchscript=True, +) +# Instantiating the model +model = BertModel(config) +# The model needs to be in evaluation mode +model.eval() +# If you are instantiating the model with from_pretrained you can also easily set the TorchScript flag +model = BertModel.from_pretrained("google-bert/bert-base-uncased", torchscript=True) +# Creating the trace +traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors]) +torch.jit.save(traced_model, "traced_bert.pt") + +Loading a model +Now you can load the previously saved BertModel, traced_bert.pt, from disk and use +it on the previously initialised dummy_input: +python +loaded_model = torch.jit.load("traced_bert.pt") +loaded_model.eval() +all_encoder_layers, pooled_output = loaded_model(*dummy_input) + +Using a traced model for inference +Use the traced model for inference by using its __call__ dunder method: +python +traced_model(tokens_tensor, segments_tensors) +Deploy Hugging Face TorchScript models to AWS with the Neuron SDK +AWS introduced the Amazon EC2 Inf1 +instance family for low cost, high performance machine learning inference in the cloud.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_26.txt b/chunked/content_aware_chunking/_torchscript/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..a61cc152b2c10acd77fb72d358e5d6c94b7f1a35 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_26.txt @@ -0,0 +1,2 @@ +The Inf1 instances are powered by the AWS Inferentia chip, a custom-built hardware +accelerator, specializing in deep learning inferencing workloads. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_27.txt b/chunked/content_aware_chunking/_torchscript/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..670c5515a82d92d2f68f34c7d26538125a853889 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_27.txt @@ -0,0 +1,4 @@ +AWS +Neuron is the SDK for +Inferentia that supports tracing and optimizing transformers models for deployment on +Inf1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_28.txt b/chunked/content_aware_chunking/_torchscript/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..fac1620bccd85b796fb49640beaf92d940bdcfbd --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_28.txt @@ -0,0 +1,4 @@ +The Neuron SDK provides: + +Easy-to-use API with one line of code change to trace and optimize a TorchScript + model for inference in the cloud. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_29.txt b/chunked/content_aware_chunking/_torchscript/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..447065816d13a883e23abac4a4a5e25ca11d45c7 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_29.txt @@ -0,0 +1,2 @@ +Out of the box performance optimizations for improved + cost-performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_30.txt b/chunked/content_aware_chunking/_torchscript/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..168513536615de4aa6a367c59f808976e8e17070 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_30.txt @@ -0,0 +1,4 @@ +Support for Hugging Face transformers models built with either + PyTorch + or + TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_31.txt b/chunked/content_aware_chunking/_torchscript/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..9229bf10061022e396a92360bec9bd621a38d52b --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_31.txt @@ -0,0 +1,8 @@ +Implications +Transformers models based on the BERT (Bidirectional Encoder Representations from +Transformers) +architecture, or its variants such as +distilBERT and +roBERTa run best on +Inf1 for non-generative tasks such as extractive question answering, sequence +classification, and token classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_32.txt b/chunked/content_aware_chunking/_torchscript/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..891025cc617fb8e911911dee3b038bb5ba333067 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_32.txt @@ -0,0 +1,3 @@ +However, text generation tasks can still be +adapted to run on Inf1 according to this AWS Neuron MarianMT +tutorial. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_33.txt b/chunked/content_aware_chunking/_torchscript/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f748bd4d0dc23f24d94bf44e533af9290ab8c7a5 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_33.txt @@ -0,0 +1,4 @@ +More information about models that can be converted out of the box on Inferentia can be +found in the Model Architecture +Fit +section of the Neuron documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_34.txt b/chunked/content_aware_chunking/_torchscript/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..df8fc7371912ac4df4d04f0e465c603ac8834998 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_34.txt @@ -0,0 +1,5 @@ +Dependencies +Using AWS Neuron to convert models requires a Neuron SDK +environment +which comes preconfigured on AWS Deep Learning +AMI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_35.txt b/chunked/content_aware_chunking/_torchscript/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3929bf4bbcbee7f8784f5d49a0af3c42f8efee7 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_35.txt @@ -0,0 +1,3 @@ +Converting a model for AWS Neuron +Convert a model for AWS NEURON using the same code from Using TorchScript in +Python to trace a BertModel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_36.txt b/chunked/content_aware_chunking/_torchscript/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..09c9a13261129756749af5718de194653764c270 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_36.txt @@ -0,0 +1,12 @@ +Import the +torch.neuron framework extension to access the components of the Neuron SDK through a +Python API: +python +from transformers import BertModel, BertTokenizer, BertConfig +import torch +import torch.neuron +You only need to modify the following line: +diff +- torch.jit.trace(model, [tokens_tensor, segments_tensors]) ++ torch.neuron.trace(model, [token_tensor, segments_tensors]) +This enables the Neuron SDK to trace the model and optimize it for Inf1 instances. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_37.txt b/chunked/content_aware_chunking/_torchscript/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b270e7d6785ab5bd32ba37282191972a0d82dd5 --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_37.txt @@ -0,0 +1,3 @@ +To learn more about AWS Neuron SDK features, tools, example tutorials and latest +updates, please see the AWS NeuronSDK +documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_torchscript/chunk_9.txt b/chunked/content_aware_chunking/_torchscript/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a5411e8971505e832f7adf9bc42e041763daf7e --- /dev/null +++ b/chunked/content_aware_chunking/_torchscript/chunk_9.txt @@ -0,0 +1,2 @@ +Models instantiated with the torchscript flag have their Embedding layer and +Decoding layer separated, which means that they should not be trained down the line. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_20.txt b/chunked/content_aware_chunking/_trainer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb7f0b68750c7176b832c8f709bd4468cfc6480c --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_20.txt @@ -0,0 +1 @@ +You'll find the checkpoints saved in a checkpoint-000 subfolder where the numbers at the end correspond to the training step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_21.txt b/chunked/content_aware_chunking/_trainer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee754e21be11adde6b4d79f1479a793d78ea87c2 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_21.txt @@ -0,0 +1 @@ +Saving checkpoints are useful for resuming training later. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_22.txt b/chunked/content_aware_chunking/_trainer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72cb1a35072fb4ebe3a02b708396fca335e9398 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_22.txt @@ -0,0 +1,6 @@ +resume from latest checkpoint +trainer.train(resume_from_checkpoint=True) +resume from specific checkpoint saved in output directory +trainer.train(resume_from_checkpoint="your-model/checkpoint-1000") + +You can save your checkpoints (the optimizer state is not saved by default) to the Hub by setting push_to_hub=True in [TrainingArguments] to commit and push them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_23.txt b/chunked/content_aware_chunking/_trainer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..16d2bd50c07b6529c520c51adf210f1503547eba --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_23.txt @@ -0,0 +1,6 @@ +Other options for deciding how your checkpoints are saved are set up in the hub_strategy parameter: + +hub_strategy="checkpoint" pushes the latest checkpoint to a subfolder named "last-checkpoint" from which you can resume training +hug_strategy="all_checkpoints" pushes all checkpoints to the directory defined in output_dir (you'll see one checkpoint per folder in your model repository) + +When you resume training from a checkpoint, the [Trainer] tries to keep the Python, NumPy, and PyTorch RNG states the same as they were when the checkpoint was saved. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_24.txt b/chunked/content_aware_chunking/_trainer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..99c3f0d4d157081991484d0a0397f1d019016858 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_24.txt @@ -0,0 +1 @@ +But because PyTorch has various non-deterministic default settings, the RNG states aren't guaranteed to be the same. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_25.txt b/chunked/content_aware_chunking/_trainer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..068925a3b992c77ad0ad45999f03fcb352b46469 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_25.txt @@ -0,0 +1 @@ +If you want to enable full determinism, take a look at the Controlling sources of randomness guide to learn what you can enable to make your training fully deterministic. 
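As a sketch of what that guide covers (assuming a recent transformers release that exports both helpers), seeding and full determinism can be enabled before creating the [Trainer]:
python
from transformers import enable_full_determinism, set_seed

set_seed(42)                 # seeds the Python, NumPy and PyTorch RNGs
enable_full_determinism(42)  # additionally switches PyTorch to deterministic algorithms (slower training)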
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_26.txt b/chunked/content_aware_chunking/_trainer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..68baba2660a6d6ea937b2ffc5e5c6082d883f086 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_26.txt @@ -0,0 +1 @@ +Keep in mind though that by making certain settings deterministic, training may be slower. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_27.txt b/chunked/content_aware_chunking/_trainer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..56402d845cb7de7c72d935ccbfc68070c094cd01 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_27.txt @@ -0,0 +1,2 @@ +Customize the Trainer +While the [Trainer] class is designed to be accessible and easy-to-use, it also offers a lot of customizability for more adventurous users. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_28.txt b/chunked/content_aware_chunking/_trainer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e72c8e99083ef4737f53e04a31c156b3c9f87ae --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_28.txt @@ -0,0 +1 @@ +Many of the [Trainer]'s method can be subclassed and overridden to support the functionality you want, without having to rewrite the entire training loop from scratch to accommodate it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_29.txt b/chunked/content_aware_chunking/_trainer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..47536921ad11dd142f97c0cf8a8251bb0e702f32 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_29.txt @@ -0,0 +1,14 @@ +These methods include: + +[~Trainer.get_train_dataloader] creates a training DataLoader +[~Trainer.get_eval_dataloader] creates an evaluation DataLoader +[~Trainer.get_test_dataloader] creates a test DataLoader +[~Trainer.log] logs information on the various objects that watch training +[~Trainer.create_optimizer_and_scheduler] creates an optimizer and learning rate scheduler if they weren't passed in the __init__; these can also be separately customized with [~Trainer.create_optimizer] and [~Trainer.create_scheduler] respectively +[~Trainer.compute_loss] computes the loss on a batch of training inputs +[~Trainer.training_step] performs the training step +[~Trainer.prediction_step] performs the prediction and test step +[~Trainer.evaluate] evaluates the model and returns the evaluation metrics +[~Trainer.predict] makes predictions (with metrics if labels are available) on the test set + +For example, if you want to customize the [~Trainer.compute_loss] method to use a weighted loss instead. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_30.txt b/chunked/content_aware_chunking/_trainer/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..430bd5cfb7859ddbe640510eed354374586f5587 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_30.txt @@ -0,0 +1,15 @@ +from torch import nn +from transformers import Trainer +class CustomTrainer(Trainer): + def compute_loss(self, model, inputs, return_outputs=False): + labels = inputs.pop("labels") + # forward pass + outputs = model(**inputs) + logits = outputs.get("logits") + # compute custom loss for 3 labels with different weights + loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device)) + loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1)) + return (loss, outputs) if return_outputs else loss + +Callbacks +Another option for customizing the [Trainer] is to use callbacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_31.txt b/chunked/content_aware_chunking/_trainer/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b87a1a5041bfa36f7f1c41b04cad2638a9149d8 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_31.txt @@ -0,0 +1 @@ +Callbacks don't change anything in the training loop. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_32.txt b/chunked/content_aware_chunking/_trainer/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e218a90c6fb31f169bd87fac4bfeca23d5d2c1 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_32.txt @@ -0,0 +1 @@ +They inspect the training loop state and then execute some action (early stopping, logging results, etc.) \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_33.txt b/chunked/content_aware_chunking/_trainer/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..280b8452b218e04f36f49e86ed837fb5c3e252c9 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_33.txt @@ -0,0 +1 @@ +depending on the state. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_34.txt b/chunked/content_aware_chunking/_trainer/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad4416180bcccf0bf96f4329cd248f21450585d4 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_34.txt @@ -0,0 +1 @@ +In other words, a callback can't be used to implement something like a custom loss function and you'll need to subclass and override the [~Trainer.compute_loss] method for that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_35.txt b/chunked/content_aware_chunking/_trainer/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fad1891b7d57c7afdca8d75828328a599137e99 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_35.txt @@ -0,0 +1 @@ +For example, if you want to add an early stopping callback to the training loop after 10 steps. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_36.txt b/chunked/content_aware_chunking/_trainer/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c94363c824f0322a6ce2f8af8147bcf2154e5ce --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_36.txt @@ -0,0 +1,11 @@ +from transformers import TrainerCallback +class EarlyStoppingCallback(TrainerCallback): + def __init__(self, num_steps=10): + self.num_steps = num_steps + def on_step_end(self, args, state, control, **kwargs): + if state.global_step >= self.num_steps: + return {"should_training_stop": True} + else: + return {} + +Then pass it to the [Trainer]'s callbacks parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_37.txt b/chunked/content_aware_chunking/_trainer/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbd32f29dc185ec0f640313a52382b2622ea2eef --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_37.txt @@ -0,0 +1,15 @@ +from transformers import Trainer +trainer = Trainer( + model=model, + args=training_args, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + callbacks=[EarlyStoppingCallback()], +) + +Logging + +Check out the logging API reference for more information about the different logging levels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_38.txt b/chunked/content_aware_chunking/_trainer/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..efddc66f378dac37e482505f46473c289011e23c --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_38.txt @@ -0,0 +1 @@ +The [Trainer] is set to logging.INFO by default which reports errors, warnings, and other basic information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_39.txt b/chunked/content_aware_chunking/_trainer/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..5589ca2e1b3e486b1a6be80f27422033c3de2a2a --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_39.txt @@ -0,0 +1 @@ +A [Trainer] replica - in distributed environments - is set to logging.WARNING which only reports errors and warnings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_40.txt b/chunked/content_aware_chunking/_trainer/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f40da0ba14b3224827ab6449ddf0eddb8009636b --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_40.txt @@ -0,0 +1 @@ +You can change the logging level with the log_level and log_level_replica parameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_41.txt b/chunked/content_aware_chunking/_trainer/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cae39c647da53de82159d7016506d19e6b383a5 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_41.txt @@ -0,0 +1 @@ +To configure the log level setting for each node, use the log_on_each_node parameter to determine whether to use the log level on each node or only on the main node.
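For example, these parameters could be set directly in [TrainingArguments] (the output_dir value here is just a placeholder):
python
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="test_trainer",
    log_level="warning",        # main process logs warnings and above
    log_level_replica="error",  # replicas only log errors
    log_on_each_node=True,      # apply these levels on every node, not just the main one
)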
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_42.txt b/chunked/content_aware_chunking/_trainer/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..343675f26938684e4f47e3419944aacd24039678 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_42.txt @@ -0,0 +1 @@ +[Trainer] sets the log level separately for each node in the [Trainer.__init__] method, so you may want to consider setting this sooner if you're using other Transformers functionalities before creating the [Trainer] object. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_43.txt b/chunked/content_aware_chunking/_trainer/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a90527715c640c9e6207de83288732fa6d5c0a5 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_43.txt @@ -0,0 +1,15 @@ +For example, to set your main code and modules to use the same log level according to each node: + +logger = logging.getLogger(name) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], +) +log_level = training_args.get_process_log_level() +logger.setLevel(log_level) +datasets.utils.logging.set_verbosity(log_level) +transformers.utils.logging.set_verbosity(log_level) +trainer = Trainer() + +Use different combinations of log_level and log_level_replica to configure what gets logged on each of the nodes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_44.txt b/chunked/content_aware_chunking/_trainer/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..03a90abf28ec9605491eac9dd142927e109c4e60 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_44.txt @@ -0,0 +1,3 @@ +my_app.py --log_level warning --log_level_replica error + +Add the log_on_each_node 0 parameter for multi-node environments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_45.txt b/chunked/content_aware_chunking/_trainer/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..4df2d57de43a0acf7ec632eece8478918be15df2 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_45.txt @@ -0,0 +1,7 @@ +```bash +my_app.py --log_level warning --log_level_replica error --log_on_each_node 0 +set to only report errors +my_app.py --log_level error --log_level_replica error --log_on_each_node 0 + +NEFTune +NEFTune is a technique that can improve performance by adding noise to the embedding vectors during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_46.txt b/chunked/content_aware_chunking/_trainer/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4ffb3fd87c709ad324054a977651474005747f5 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_46.txt @@ -0,0 +1 @@ +To enable it in [Trainer], set the neftune_noise_alpha parameter in [TrainingArguments] to control how much noise is added. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_47.txt b/chunked/content_aware_chunking/_trainer/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8fe79d97596cfa3443aa01762730bf04e97e75 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_47.txt @@ -0,0 +1,5 @@ +from transformers import TrainingArguments, Trainer +training_args = TrainingArguments(, neftune_noise_alpha=0.1) +trainer = Trainer(, args=training_args) + +NEFTune is disabled after training to restore the original embedding layer to avoid any unexpected behavior. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_48.txt b/chunked/content_aware_chunking/_trainer/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..5069da61f582cea3f4eb4761fadca6674bf6ebc6 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_48.txt @@ -0,0 +1,2 @@ +Accelerate and Trainer +The [Trainer] class is powered by Accelerate, a library for easily training PyTorch models in distributed environments with support for integrations such as FullyShardedDataParallel (FSDP) and DeepSpeed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_49.txt b/chunked/content_aware_chunking/_trainer/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee46ac0fdc3850ec07428da560cb1fc224a6bb75 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_49.txt @@ -0,0 +1 @@ +Learn more about FSDP sharding strategies, CPU offloading, and more with the [Trainer] in the Fully Sharded Data Parallel guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_50.txt b/chunked/content_aware_chunking/_trainer/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..17d99de9e35ff019bb618e82de3fb67b8624efb1 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_50.txt @@ -0,0 +1 @@ +To use Accelerate with [Trainer], run the accelerate.config command to set up training for your training environment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_51.txt b/chunked/content_aware_chunking/_trainer/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..76ee7949c80ab40e452b910d2643b24ce5d7d737 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_51.txt @@ -0,0 +1 @@ +This command creates a config_file.yaml that'll be used when you launch your training script. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_52.txt b/chunked/content_aware_chunking/_trainer/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfeaa7799180157b34baca5b781e8fd58d5c1d63 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_52.txt @@ -0,0 +1,89 @@ +For example, some example configurations you can setup are: + +yml +compute_environment: LOCAL_MACHINE +distributed_type: MULTI_GPU +downcast_bf16: 'no' +gpu_ids: all +machine_rank: 0 #change rank as per the node +main_process_ip: 192.168.20.1 +main_process_port: 9898 +main_training_function: main +mixed_precision: fp16 +num_machines: 2 +num_processes: 8 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false + +yml +compute_environment: LOCAL_MACHINE +distributed_type: FSDP +downcast_bf16: 'no' +fsdp_config: + fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP + fsdp_backward_prefetch_policy: BACKWARD_PRE + fsdp_forward_prefetch: true + fsdp_offload_params: false + fsdp_sharding_strategy: 1 + fsdp_state_dict_type: FULL_STATE_DICT + fsdp_sync_module_states: true + fsdp_transformer_layer_cls_to_wrap: BertLayer + fsdp_use_orig_params: true +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 2 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false + +yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + deepspeed_config_file: /home/user/configs/ds_zero3_config.json + zero3_init_flag: true +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false + +yml +compute_environment: LOCAL_MACHINE +deepspeed_config: + gradient_accumulation_steps: 1 + gradient_clipping: 0.7 + offload_optimizer_device: cpu + offload_param_device: cpu + zero3_init_flag: true + zero_stage: 2 +distributed_type: DEEPSPEED +downcast_bf16: 'no' +machine_rank: 0 +main_training_function: main +mixed_precision: bf16 +num_machines: 1 +num_processes: 4 +rdzv_backend: static +same_network: true +tpu_env: [] +tpu_use_cluster: false +tpu_use_sudo: false +use_cpu: false + +The accelerate_launch command is the recommended way to launch your training script on a distributed system with Accelerate and [Trainer] with the parameters specified in config_file.yaml. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_53.txt b/chunked/content_aware_chunking/_trainer/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..51ca492a7c5d1d6d63fee346a66184a1e0050483 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_53.txt @@ -0,0 +1 @@ +This file is saved to the Accelerate cache folder and automatically loaded when you run accelerate_launch. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_trainer/chunk_54.txt b/chunked/content_aware_chunking/_trainer/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..8881873d4703e60a97ef54a63fb4a4b4b3216b47 --- /dev/null +++ b/chunked/content_aware_chunking/_trainer/chunk_54.txt @@ -0,0 +1,35 @@ +For example, to run the run_glue.py training script with the FSDP configuration: + +accelerate launch \ + ./examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +You could also specify the parameters from the config_file.yaml file directly in the command line: + +accelerate launch --num_processes=2 \ + --use_fsdp \ + --mixed_precision=bf16 \ + --fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP \ + --fsdp_transformer_layer_cls_to_wrap="BertLayer" \ + --fsdp_sharding_strategy=1 \ + --fsdp_state_dict_type=FULL_STATE_DICT \ + ./examples/pytorch/text-classification/run_glue.py + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 16 \ + --learning_rate 5e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir +Check out the Launching your Accelerate scripts tutorial to learn more about accelerate_launch and custom configurations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_27.txt b/chunked/content_aware_chunking/_training/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..85afbc328687f1caf4a972fd9909f298ac5b68c7 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_27.txt @@ -0,0 +1 @@ +The [Trainer] API supports a wide range of training options and features such as logging, gradient accumulation, and mixed precision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_28.txt b/chunked/content_aware_chunking/_training/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..089e61a8def7104fc238fbc3b71458a358c2f752 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_28.txt @@ -0,0 +1 @@ +Start by loading your model and specify the number of expected labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_29.txt b/chunked/content_aware_chunking/_training/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cae0a69a6faa3535521889efee64bec075367b5 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_29.txt @@ -0,0 +1,7 @@ +From the Yelp Review dataset card, you know there are five labels: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) + +You will see a warning about some of the pretrained weights not being used and some weights being randomly +initialized. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_30.txt b/chunked/content_aware_chunking/_training/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..60c0299f08c5e0d3a6f7b35f52b35089ef54f469 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_30.txt @@ -0,0 +1 @@ +Don't worry, this is completely normal! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_31.txt b/chunked/content_aware_chunking/_training/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a35f68565e4463713f131bc368a688d1d97a2f2c --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_31.txt @@ -0,0 +1 @@ +The pretrained head of the BERT model is discarded, and replaced with a randomly initialized classification head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_32.txt b/chunked/content_aware_chunking/_training/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..887445380de75a34455ae1953bb2d6ec425e6d10 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_32.txt @@ -0,0 +1 @@ +You will fine-tune this new model head on your sequence classification task, transferring the knowledge of the pretrained model to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_33.txt b/chunked/content_aware_chunking/_training/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e278a881c44969da7d2a5a9e2ac8d27a6556836f --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_33.txt @@ -0,0 +1,2 @@ +Training hyperparameters +Next, create a [TrainingArguments] class which contains all the hyperparameters you can tune as well as flags for activating different training options. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_34.txt b/chunked/content_aware_chunking/_training/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..caaaf5a574a4a4fc1a231468953588aa109460a2 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_34.txt @@ -0,0 +1 @@ +For this tutorial you can start with the default training hyperparameters, but feel free to experiment with these to find your optimal settings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_35.txt b/chunked/content_aware_chunking/_training/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..3abc1c6f0234a8d1b0a93a97237b33c6126bc58a --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_35.txt @@ -0,0 +1,7 @@ +Specify where to save the checkpoints from your training: + +from transformers import TrainingArguments +training_args = TrainingArguments(output_dir="test_trainer") + +Evaluate +[Trainer] does not automatically evaluate model performance during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_36.txt b/chunked/content_aware_chunking/_training/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..945cd322fec21acb783040e928a4f06b4f32c0f8 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_36.txt @@ -0,0 +1 @@ +You'll need to pass [Trainer] a function to compute and report metrics. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_37.txt b/chunked/content_aware_chunking/_training/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..59ed2f64599734c34a32f7900e2d62891b2cf5fb --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_37.txt @@ -0,0 +1,7 @@ +The 🤗 Evaluate library provides a simple accuracy function you can load with the [evaluate.load] (see this quicktour for more information) function: + +import numpy as np +import evaluate +metric = evaluate.load("accuracy") + +Call [~evaluate.compute] on metric to calculate the accuracy of your predictions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_38.txt b/chunked/content_aware_chunking/_training/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..1485f0e5fd663c2ca1f5277619b55ec4ff9ed473 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_38.txt @@ -0,0 +1,29 @@ +Before passing your predictions to compute, you need to convert the logits to predictions (remember all 🤗 Transformers models return logits): + +def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + +If you'd like to monitor your evaluation metrics during fine-tuning, specify the evaluation_strategy parameter in your training arguments to report the evaluation metric at the end of each epoch: + +from transformers import TrainingArguments, Trainer +training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch") + +Trainer +Create a [Trainer] object with your model, training arguments, training and test datasets, and evaluation function: + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, + ) + +Then fine-tune your model by calling [~transformers.Trainer.train]: + +trainer.train() + +Train a TensorFlow model with Keras +You can also train 🤗 Transformers models in TensorFlow with the Keras API! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_39.txt b/chunked/content_aware_chunking/_training/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..2927833cce4e4086108fad0e1f5f7ede90f1af27 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_39.txt @@ -0,0 +1,3 @@ +Loading data for Keras +When you want to train a 🤗 Transformers model with the Keras API, you need to convert your dataset to a format that +Keras understands. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_40.txt b/chunked/content_aware_chunking/_training/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc3626d7bc46d324aa7129afe37d09a51f56f6e --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_40.txt @@ -0,0 +1 @@ +If your dataset is small, you can just convert the whole thing to NumPy arrays and pass it to Keras. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_41.txt b/chunked/content_aware_chunking/_training/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c4e77fb0ff96e6c50f341d97c1d6a7555a59ae1 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_41.txt @@ -0,0 +1 @@ +Let's try that first before we do anything more complicated. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_42.txt b/chunked/content_aware_chunking/_training/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..35973bb037b4cf4f8c25582467ef7491182b62b2 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_42.txt @@ -0,0 +1 @@ +First, load a dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_43.txt b/chunked/content_aware_chunking/_training/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbf4624b396596f43dd4d5fc84ee4e5fca894793 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_43.txt @@ -0,0 +1,2 @@ +We'll use the CoLA dataset from the GLUE benchmark, +since it's a simple binary text classification task, and just take the training split for now. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_44.txt b/chunked/content_aware_chunking/_training/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fc0f8210a81be2231f917a7569651a710e2d1af --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_44.txt @@ -0,0 +1,5 @@ +from datasets import load_dataset +dataset = load_dataset("glue", "cola") +dataset = dataset["train"] # Just take the training split for now + +Next, load a tokenizer and tokenize the data as NumPy arrays. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_45.txt b/chunked/content_aware_chunking/_training/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d02d4d2a3f3e5b9cd07fe8fcf40c973678554a7 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_45.txt @@ -0,0 +1,2 @@ +Note that the labels are already a list of 0 and 1s, +so we can just convert that directly to a NumPy array without tokenization! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_46.txt b/chunked/content_aware_chunking/_training/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2823ed585224f3ba7fc094f64b44dd088c14a1 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_46.txt @@ -0,0 +1,8 @@ +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") +tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True) +Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras +tokenized_data = dict(tokenized_data) +labels = np.array(dataset["label"]) # Label is already an array of 0 and 1 + +Finally, load, compile, and fit the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_47.txt b/chunked/content_aware_chunking/_training/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c1036577916802b0059a99571a1e06c0b15f1f4 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_47.txt @@ -0,0 +1,8 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +from transformers import TFAutoModelForSequenceClassification +from tensorflow.keras.optimizers import Adam +Load and compile our model +model = TFAutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased") +Lower learning rates are often better for fine-tuning transformers +model.compile(optimizer=Adam(3e-5)) # No loss argument! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_48.txt b/chunked/content_aware_chunking/_training/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b278269fb8aa22590e965cddbc9d052f51484d3 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_48.txt @@ -0,0 +1,3 @@ +model.fit(tokenized_data, labels) + +You don't have to pass a loss argument to your models when you compile() them! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_49.txt b/chunked/content_aware_chunking/_training/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..6710d7db807dc9e3cf1414feccdf737445163846 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_49.txt @@ -0,0 +1,2 @@ +Hugging Face models automatically +choose a loss that is appropriate for their task and model architecture if this argument is left blank. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_50.txt b/chunked/content_aware_chunking/_training/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..92045e9d550c573010ec7d9f828cf20f4a0607bb --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_50.txt @@ -0,0 +1,2 @@ +You can always +override this by specifying a loss yourself if you want to! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_51.txt b/chunked/content_aware_chunking/_training/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aef680998a21570f2eb71381f087dfe3b20311c --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_51.txt @@ -0,0 +1 @@ +This approach works great for smaller datasets, but for larger datasets, you might find it starts to become a problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_52.txt b/chunked/content_aware_chunking/_training/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9e07966c04eee20748a971960433112b34835f9 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_52.txt @@ -0,0 +1 @@ +Why? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_53.txt b/chunked/content_aware_chunking/_training/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..e757bec7f3c6923af72522e87cf98525864d30f8 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_53.txt @@ -0,0 +1,3 @@ +Because the tokenized array and labels would have to be fully loaded into memory, and because NumPy doesn't handle +"jagged" arrays, so every tokenized sample would have to be padded to the length of the longest sample in the whole +dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_54.txt b/chunked/content_aware_chunking/_training/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1a79885c28d07d7e74b788c7a91cd5634fe3baa --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_54.txt @@ -0,0 +1 @@ +That's going to make your array even bigger, and all those padding tokens will slow down training too!
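To make the padding overhead concrete, here is a small illustrative sketch (not part of the original guide; the checkpoint matches the one used above, the sentences are arbitrary) comparing padding the whole dataset at once with padding a smaller batch on its own:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
sentences = [
    "A short sentence.",
    "Another short one.",
    "A deliberately much longer sentence that forces every other sample to be padded to its length.",
]

# Padding everything at once pads every row to the global maximum length
whole = tokenizer(sentences, padding=True, return_tensors="np")
print(whole["input_ids"].shape)  # (3, length of the longest sentence)

# Padding per batch only pads to that batch's own maximum, so batches of short samples stay short
batch = tokenizer(sentences[:2], padding=True, return_tensors="np")
print(batch["input_ids"].shape)  # (2, length of the longest of the first two)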
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_55.txt b/chunked/content_aware_chunking/_training/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..b341d441d5c8416a0c24e3dc72e5be3a931d7a97 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_55.txt @@ -0,0 +1,2 @@ +Loading data as a tf.data.Dataset +If you want to avoid slowing down training, you can load your data as a tf.data.Dataset instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_56.txt b/chunked/content_aware_chunking/_training/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..96bffc3568e356a8c0ecdca9a8a3a9fa900565f2 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_56.txt @@ -0,0 +1,4 @@ +Although you can write your own +tf.data pipeline if you want, we have two convenience methods for doing this: + +[~TFPreTrainedModel.prepare_tf_dataset]: This is the method we recommend in most cases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_57.txt b/chunked/content_aware_chunking/_training/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..61b99d41fd242d1d8eccec62e265a7aedb178857 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_57.txt @@ -0,0 +1,3 @@ +Because it is a method +on your model, it can inspect the model to automatically figure out which columns are usable as model inputs, and +discard the others to make a simpler, more performant dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_58.txt b/chunked/content_aware_chunking/_training/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e03d25957ae64d241c02deefcbd6ba8a27c5993 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_58.txt @@ -0,0 +1,2 @@ +[~datasets.Dataset.to_tf_dataset]: This method is more low-level, and is useful when you want to exactly control how +your dataset is created, by specifying exactly which columns and label_cols to include. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_59.txt b/chunked/content_aware_chunking/_training/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..636feb96177655dd7cc8682738eb0c9beea59223 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_59.txt @@ -0,0 +1,9 @@ +Before you can use [~TFPreTrainedModel.prepare_tf_dataset], you will need to add the tokenizer outputs to your dataset as columns, as shown in +the following code sample: + +def tokenize_dataset(data): + # Keys of the returned dictionary will be added to the dataset as columns + return tokenizer(data["text"]) +dataset = dataset.map(tokenize_dataset) + +Remember that Hugging Face datasets are stored on disk by default, so this will not inflate your memory usage! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_60.txt b/chunked/content_aware_chunking/_training/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..4477063417b482ca34bb95c14db52ff67bc82717 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_60.txt @@ -0,0 +1,3 @@ +Once the +columns have been added, you can stream batches from the dataset and add padding to each batch, which greatly +reduces the number of padding tokens compared to padding the entire dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_61.txt b/chunked/content_aware_chunking/_training/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..416a4175c6a2042e5036a67fcc86592daaf73390 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_61.txt @@ -0,0 +1,3 @@ +tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer) + +Note that in the code sample above, you need to pass the tokenizer to prepare_tf_dataset so it can correctly pad batches as they're loaded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_62.txt b/chunked/content_aware_chunking/_training/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7310e7fb5826c140cd6b5e3b7b1eac656f17361 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_62.txt @@ -0,0 +1 @@ +If all the samples in your dataset are the same length and no padding is necessary, you can skip this argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_63.txt b/chunked/content_aware_chunking/_training/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..28dd6d0c422b8d9a4388d8a7313c24c96eb8ad0d --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_63.txt @@ -0,0 +1 @@ +If you need to do something more complex than just padding samples (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_64.txt b/chunked/content_aware_chunking/_training/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..5615a4ae1da778ac200c8461920f20993750791f --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_64.txt @@ -0,0 +1,3 @@ +corrupting tokens for masked language +modelling), you can use the collate_fn argument instead to pass a function that will be called to transform the +list of samples into a batch and apply any preprocessing you want. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_65.txt b/chunked/content_aware_chunking/_training/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..e80806caf68e5bfdf50150687137b246aa2ae663 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_65.txt @@ -0,0 +1,3 @@ +See our +examples or +notebooks to see this approach in action. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_66.txt b/chunked/content_aware_chunking/_training/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..612dbda32610ca6cb7f2215d523f5d7199456ea2 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_66.txt @@ -0,0 +1,3 @@ +Once you've created a tf.data.Dataset, you can compile and fit the model as before: + +model.compile(optimizer=Adam(3e-5)) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_67.txt b/chunked/content_aware_chunking/_training/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1a996272ff33c405b16de1fdda42ba44ae14f30 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_67.txt @@ -0,0 +1,5 @@ +model.fit(tf_dataset) + +Train in native PyTorch + +[Trainer] takes care of the training loop and allows you to fine-tune a model in a single line of code. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_68.txt b/chunked/content_aware_chunking/_training/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..68f8f90aeb8ea3d018d01040b18ef4d9ac690f30 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_68.txt @@ -0,0 +1 @@ +For users who prefer to write their own training loop, you can also fine-tune a 🤗 Transformers model in native PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_69.txt b/chunked/content_aware_chunking/_training/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fd298462a3632b46c15f65c4cb2d5004934de62 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_69.txt @@ -0,0 +1,6 @@ +At this point, you may need to restart your notebook or execute the following code to free some memory: +py +del model +del trainer +torch.cuda.empty_cache() +Next, manually postprocess tokenized_dataset to prepare it for training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_70.txt b/chunked/content_aware_chunking/_training/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..185fa01ea10579a73ae4b6ffae25ae769bf5e46e --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_70.txt @@ -0,0 +1,31 @@ +Remove the text column because the model does not accept raw text as an input: + +tokenized_datasets = tokenized_datasets.remove_columns(["text"]) + +Rename the label column to labels because the model expects the argument to be named labels: + +tokenized_datasets = tokenized_datasets.rename_column("label", "labels") + +Set the format of the dataset to return PyTorch tensors instead of lists: + +tokenized_datasets.set_format("torch") + +Then create a smaller subset of the dataset as previously shown to speed up the fine-tuning: + +small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000)) +small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000)) + +DataLoader +Create a DataLoader for your training and test datasets so you can iterate over batches of data: + +from torch.utils.data import DataLoader +train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8) +eval_dataloader = DataLoader(small_eval_dataset, batch_size=8) + +Load your model with the number of expected labels: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=5) + +Optimizer and learning rate scheduler +Create an optimizer and learning rate scheduler to fine-tune the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_71.txt b/chunked/content_aware_chunking/_training/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..17640ad0a32b650e7338eaca961623a7c333a49d --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_71.txt @@ -0,0 +1,15 @@ +Let's use the AdamW optimizer from PyTorch: + +from torch.optim import AdamW +optimizer = AdamW(model.parameters(), lr=5e-5) + +Create the default learning rate scheduler from [Trainer]: + +from transformers import get_scheduler +num_epochs = 3 +num_training_steps = num_epochs * len(train_dataloader) +lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + +Lastly, specify device to use a GPU if you have access to one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_72.txt b/chunked/content_aware_chunking/_training/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b576f5790f5dcfd093f5680e59091424b9d5a43 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_72.txt @@ -0,0 +1 @@ +Otherwise, training on a CPU may take several hours instead of a couple of minutes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_73.txt b/chunked/content_aware_chunking/_training/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd6a0d08b7a2cd65fbf9bf192a9fa167e4535401 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_73.txt @@ -0,0 +1,5 @@ +import torch +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") +model.to(device) + +Get free access to a cloud GPU if you don't have one with a hosted notebook like Colaboratory or SageMaker StudioLab. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_74.txt b/chunked/content_aware_chunking/_training/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..95106c70cadd557f7d9cbbcca27b2eff85f2161c --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_74.txt @@ -0,0 +1 @@ +Great, now you are ready to train! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_75.txt b/chunked/content_aware_chunking/_training/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..501e4c0065fd5c4e8d00cf87b400645ee4ee0e90 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_75.txt @@ -0,0 +1,21 @@ +🥳 +Training loop +To keep track of your training progress, use the tqdm library to add a progress bar over the number of training steps: + +from tqdm.auto import tqdm +progress_bar = tqdm(range(num_training_steps)) +model.train() +for epoch in range(num_epochs): + for batch in train_dataloader: + batch = {k: v.to(device) for k, v in batch.items()} + outputs = model(**batch) + loss = outputs.loss + loss.backward() + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + +Evaluate +Just like how you added an evaluation function to [Trainer], you need to do the same when you write your own training loop. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_76.txt b/chunked/content_aware_chunking/_training/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdd2ee0d2e5e70e15de9cb78d4a219b9faedc2ad --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_76.txt @@ -0,0 +1 @@ +But instead of calculating and reporting the metric at the end of each epoch, this time you'll accumulate all the batches with [~evaluate.add_batch] and calculate the metric at the very end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_77.txt b/chunked/content_aware_chunking/_training/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef6360a5f910ad214bfd46db6e543c2d2aade28c --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_77.txt @@ -0,0 +1,19 @@ +import evaluate +metric = evaluate.load("accuracy") +model.eval() +for batch in eval_dataloader: + batch = {k: v.to(device) for k, v in batch.items()} + with torch.no_grad(): + outputs = model(**batch) + + logits = outputs.logits + predictions = torch.argmax(logits, dim=-1) + metric.add_batch(predictions=predictions, references=batch["labels"]) + +metric.compute() + +Additional resources +For more fine-tuning examples, refer to: + +🤗 Transformers Examples includes scripts + to train common NLP tasks in PyTorch and TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_training/chunk_78.txt b/chunked/content_aware_chunking/_training/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ba2d934f6003ce4a2e8d06948ffc44b76937585 --- /dev/null +++ b/chunked/content_aware_chunking/_training/chunk_78.txt @@ -0,0 +1 @@ +🤗 Transformers Notebooks contains various notebooks on how to fine-tune a model for specific tasks in PyTorch and TensorFlow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_23.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..53bb0482891634a24b5c52b17bd38324380c71c8 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_23.txt @@ -0,0 +1,3 @@ +It +can perform one or several tasks in the same instruction (though the more complex your instruction, the more likely +the agent is to fail). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_24.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae50e89984b0b21d6b62d4b85c1a09a68d24aa7f --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_24.txt @@ -0,0 +1,4 @@ +py +agent.run("Draw me a picture of the sea then transform the picture to add an island") + +Every [~Agent.run] operation is independent, so you can run it several times in a row with different tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_25.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed6ab72a1f133ce6881bc73146327f802929466a --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_25.txt @@ -0,0 +1,2 @@ +Note that your agent is just a large-language model, so small variations in your prompt might yield completely +different results. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_26.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a3c3e191a725c6e39354addcf00a5a5d0546a5 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_26.txt @@ -0,0 +1 @@ +It's important to explain as clearly as possible the task you want to perform. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_27.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..53309fd0acde6851f5561e9e091e89139954d182 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_27.txt @@ -0,0 +1,2 @@ +We go more in-depth +on how to write good prompts here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_28.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c00046f9a2cf271ef24aab23be0d8df037379b2 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_28.txt @@ -0,0 +1,2 @@ +If you'd like to keep a state across executions or to pass non-text objects to the agent, you can do so by specifying +variables that you would like the agent to use. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_29.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aaba36b9646a487600e1ea970df8992f213bda1 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_29.txt @@ -0,0 +1,4 @@ +For example, you could generate the first image of rivers and lakes, +and ask the model to update that picture to add an island by doing the following: +python +picture = agent.run("Generate a picture of rivers and lakes.") \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_30.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ced2b5d39ce0a4932d1c6600175c86453aabc2c --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_30.txt @@ -0,0 +1 @@ +updated_picture = agent.run("Transform the image in `picture` to add an island to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_31.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a3a32e0b64a3add8d67ed812a99466e1f1b9ee4 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_31.txt @@ -0,0 +1,3 @@ +", picture=picture) + +This can be helpful when the model is unable to understand your request and mixes tools. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_32.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c90c2c95d31235c3a44e5dec56346a27a519f4c --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_32.txt @@ -0,0 +1,19 @@ +An example would be: +py +agent.run("Draw me the picture of a capybara swimming in the sea") +Here, the model could interpret it in two ways: +- Have the text-to-image generate a capybara swimming in the sea +- Or, have the text-to-image generate capybara, then use the image-transformation tool to have it swim in the sea +In case you would like to force the first scenario, you could do so by passing it the prompt as an argument: +py +agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea") + +Chat-based execution (chat) +The agent also has a chat-based approach, using the [~Agent.chat] method: +py +agent.chat("Generate a picture of rivers and lakes") + +py +agent.chat("Transform the picture so that there is a rock in there") + +This is an interesting approach when you want to keep the state across instructions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_33.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a7c27481a7174575795028d27147c75bf96441a --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_33.txt @@ -0,0 +1,3 @@ +It's better for experimentation, +but will tend to be much better at single instructions rather than complex instructions (which the [~Agent.run] +method is better at handling). \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_34.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0758c54503c28668ba35a26737d9f6012cb8e30 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_34.txt @@ -0,0 +1 @@ +This method can also take arguments if you would like to pass non-text types or specific prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_35.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..14e135e69afef559c219914dc9946aea8a9183d3 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_35.txt @@ -0,0 +1,3 @@ +⚠️ Remote execution +For demonstration purposes and so that it could be used with all setups, we had created remote executors for several +of the default tools the agent has access to for the release. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_36.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a38e7170303145e775ea96e1454c3b77503ae5 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_36.txt @@ -0,0 +1,2 @@ +These are created using +inference endpoints.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_37.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..c566887af79bc3a61821e86ec58f3410398807b2 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_37.txt @@ -0,0 +1,2 @@ +We have turned these off for now, but in order to see how to set up remote executor tools yourself, +we recommend reading the custom tool guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_38.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..de7948666f995efc863dcfe05ee3fe03e1fc959e --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_38.txt @@ -0,0 +1 @@ +What's happening here? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_39.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..c90a55c66931a68db07fb3380f066300dd911439 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_39.txt @@ -0,0 +1 @@ +What are tools, and what are agents? \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_40.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fdc1b686386334550f7fcbd954e3be3463db573 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_40.txt @@ -0,0 +1,2 @@ +Agents +The "agent" here is a large language model, and we're prompting it so that it has access to a specific set of tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_41.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..8334a49b6de2ffc60975228a0a4510465404c5e4 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_41.txt @@ -0,0 +1,2 @@ +LLMs are pretty good at generating small samples of code, so this API takes advantage of that by prompting the +LLM to give a small sample of code performing a task with a set of tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_42.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..12640c131707194ffaadd06269e6c6745e5cdcbd --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_42.txt @@ -0,0 +1,2 @@ +This prompt is then completed by the +task you give your agent and the description of the tools you give it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_43.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..127ff670e7142b420276eae785ee4a0f6b435dd0 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_43.txt @@ -0,0 +1,2 @@ +This way it gets access to the doc of the +tools you are using, especially their expected inputs and outputs, and can generate the relevant code.
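To tie this together, here is a minimal sketch of instantiating such an agent (this assumes the legacy transformers agents API and uses the hosted StarCoder inference endpoint as an illustrative backend; swap in whatever endpoint you have access to):

from transformers import HfAgent

# The agent is an LLM endpoint plus a prompt that embeds the name and description of every available tool
agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")

# The task is appended to that prompt; the LLM answers with code that calls the tools
agent.run("Draw me a picture of rivers and lakes")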
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_44.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..479aee5c25f394fa0653c660089597a032dc8a22 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_44.txt @@ -0,0 +1,2 @@ +Tools +Tools are very simple: they're a single function, with a name, and a description. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_45.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..8293d8432ff81f6c194f7f1b443924f47679485b --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_45.txt @@ -0,0 +1,2 @@ +We then use these tools' descriptions +to prompt the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_46.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a51ff444d4d0372742b4f13c75e5aaf19fc332af --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_46.txt @@ -0,0 +1,2 @@ +Through the prompt, we show the agent how it would leverage tools to perform what was +requested in the query. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_47.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d7180039f7a73f2384b781643fffe987afc0f2b --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_47.txt @@ -0,0 +1 @@ +This is using brand-new tools and not pipelines, because the agent writes better code with very atomic tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_48.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffaf5acc9437038256c01e5e31c0a10f5cea3fc8 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_48.txt @@ -0,0 +1 @@ +Pipelines are more refactored and often combine several tasks in one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_49.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a06d5c7d3fd8f64b5a9ba2cff14b3b1a87034a16 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_49.txt @@ -0,0 +1,2 @@ +Tools are meant to be focused on +one very simple task only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_50.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c535ac7900e1e0f0d66ff3e9dcfb5d739d057ab --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_50.txt @@ -0,0 +1 @@ +Code-execution?! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_51.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfb88226e7f9884c730e44362476e92c34cd23fa --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_51.txt @@ -0,0 +1 @@ +This code is then executed with our small Python interpreter on the set of inputs passed along with your tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_52.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..e68d7e95b499e7e50b7f2b331a092b996c4d3af3 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_52.txt @@ -0,0 +1 @@ +We hear you screaming "Arbitrary code execution!" \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_53.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..469957d93a3d276985d662f2f26a06e8cd12e017 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_53.txt @@ -0,0 +1 @@ +in the back, but let us explain why that is not the case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_54.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..878da3b367e6150f019789ac0fdcb82956579d19 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_54.txt @@ -0,0 +1,2 @@ +The only functions that can be called are the tools you provided and the print function, so you're already +limited in what can be executed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_55.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9318a4ce92bc23b215d43229da480ad4a00e5bd1 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_55.txt @@ -0,0 +1 @@ +You should be safe if it's limited to Hugging Face tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_56.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8a19dfa09037bee2f9dac136b498fe26bfeac --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_56.txt @@ -0,0 +1,3 @@ +Then, we don't allow any attribute lookup or imports (which shouldn't be needed anyway for passing along +inputs/outputs to a small set of functions) so all the most obvious attacks (and you'd need to prompt the LLM +to output them anyway) shouldn't be an issue. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_57.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..e825955a78d3fa090efbcd65cfe50051b5945186 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_57.txt @@ -0,0 +1,3 @@ +If you want to be on the super safe side, you can execute the +run() method with the additional argument return_code=True, in which case the agent will just return the code +to execute and you can decide whether to do it or not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_58.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..0550477c29725dda8ead77f27b17d64712e38aab --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_58.txt @@ -0,0 +1,2 @@ +The execution will stop at any line trying to perform an illegal operation or if there is a regular Python error +with the code generated by the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_59.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..505d9b7451b485e59c0bef18acb76de464895a9d --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_59.txt @@ -0,0 +1,2 @@ +A curated set of tools +We identify a set of tools that can empower such agents. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_60.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0e31c3e830eb476ae4b5b01ece4a81d598a9adc --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_60.txt @@ -0,0 +1,6 @@ +Here is an updated list of the tools we have integrated +in transformers: + +Document question answering: given a document (such as a PDF) in image format, answer a question on this document (Donut) +Text question answering: given a long text and a question, answer the question in the text (Flan-T5) +Unconditional image captioning: Caption the image! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_61.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5e68af6a470fa697eb921d207a1d802146d288c --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_61.txt @@ -0,0 +1,18 @@ +(BLIP) +Image question answering: given an image, answer a question on this image (VILT) +Image segmentation: given an image and a prompt, output the segmentation mask of that prompt (CLIPSeg) +Speech to text: given an audio recording of a person talking, transcribe the speech into text (Whisper) +Text to speech: convert text to speech (SpeechT5) +Zero-shot text classification: given a text and a list of labels, identify to which label the text corresponds the most (BART) +Text summarization: summarize a long text in one or a few sentences (BART) +Translation: translate the text into a given language (NLLB) + +These tools have an integration in transformers, and can be used manually as well, for example: + +from transformers import load_tool +tool = load_tool("text-to-speech") +audio = tool("This is a text to speech tool") + +Custom tools +While we identify a curated set of tools, we strongly believe that the main value provided by this implementation is +the ability to quickly create and share custom tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_62.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c545d9c1a79adaf820beef4682b50c639313c31 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_62.txt @@ -0,0 +1,2 @@ +By pushing the code of a tool to a Hugging Face Space or a model repository, you're then able to leverage the tool +directly with the agent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_63.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc35813b5695c7181a50709f2680b2bb7a45ad3d --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_63.txt @@ -0,0 +1,10 @@ +We've added a few +transformers-agnostic tools to the huggingface-tools organization: + +Text downloader: to download a text from a web URL +Text to image: generate an image according to a prompt, leveraging stable diffusion +Image transformation: modify an image given an initial image and a prompt, leveraging instruct pix2pix stable diffusion +Text to video: generate a small video according to a prompt, leveraging damo-vilab + +The text-to-image tool we have been using since the beginning is a remote tool that lives in +huggingface-tools/text-to-image! \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_64.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..b092ea04b5a2527abd7618ce0389b7979e80a8e3 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_64.txt @@ -0,0 +1,2 @@ +We will +continue releasing such tools on this and other organizations, to further supercharge this implementation. 
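To illustrate how one of these Hub tools can be pulled in and used alongside the agent, here is a hedged sketch (the tool id is the huggingface-tools/text-to-image repository mentioned above; the additional_tools argument follows the custom tool guide, so treat the exact signature as an assumption):

from transformers import load_tool, HfAgent

# Load a community tool directly from the Hub by its repository id
image_generator = load_tool("huggingface-tools/text-to-image")

# Hand it to the agent on top of its default toolbox
agent = HfAgent(
    "https://api-inference.huggingface.co/models/bigcode/starcoder",
    additional_tools=[image_generator],
)
agent.run("Generate an image of a sunset over a lake")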
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_65.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..88dd98608a906607dcd7a05a92282b381e81028b --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_65.txt @@ -0,0 +1 @@ +The agents have by default access to tools that reside on huggingface-tools. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_66.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..1743588c1148b14460afcf28523db09d51ed137a --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_66.txt @@ -0,0 +1 @@ +We explain how you can write and share your tools, as well as how to leverage any custom tool that resides on the Hub, in the following guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_67.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..f720d5f8b0182b2dae075136dfd4c7d3b2c87a1e --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_67.txt @@ -0,0 +1,2 @@ +Code generation +So far we have shown how to use the agents to perform actions for you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_68.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..639b8bccfc871e816f46c2a92706109c03ead354 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_68.txt @@ -0,0 +1,2 @@ +However, the agent is only generating code +that we then execute using a very restricted Python interpreter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_69.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..9101c09a72f1483795a6d151b914a5aef2d5495c --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_69.txt @@ -0,0 +1,2 @@ +In case you would like to use the code generated in +a different setting, the agent can be prompted to return the code, along with tool definition and accurate imports. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_transformers_agents/chunk_70.txt b/chunked/content_aware_chunking/_transformers_agents/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..4373aeaec42b943d65e7f07969a4672a817893f2 --- /dev/null +++ b/chunked/content_aware_chunking/_transformers_agents/chunk_70.txt @@ -0,0 +1,10 @@ +For example, the following instruction +python +agent.run("Draw me a picture of rivers and lakes", return_code=True) +returns the following code +python +from transformers import load_tool +image_generator = load_tool("huggingface-tools/text-to-image") +image = image_generator(prompt="rivers and lakes") + +that you can then modify and execute yourself.
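As a usage note (a sketch, not from the original guide): because the returned snippet is plain Python, you can capture it as a string, review it, and only then run it in your own environment:

code = agent.run("Draw me a picture of rivers and lakes", return_code=True)
print(code)  # inspect the generated code and the tools it loads
exec(code)   # run it yourself once you are happy with what it does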
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_14.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff9efd3b9ae550a04f46aca3426493e77f37c3d5 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_14.txt @@ -0,0 +1,2 @@ +CUDA out of memory +Training large models with millions of parameters can be challenging without the appropriate hardware. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_15.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..23d3b8f20c20fcc2d438e0cf4ae2f0db906af801 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_15.txt @@ -0,0 +1,2 @@ +A common error you may encounter when the GPU runs out of memory is: +CUDA out of memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_16.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..035b7d047bcd1c8a9714fb037761835ffd6a9162 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_16.txt @@ -0,0 +1,4 @@ +Tried to allocate 256.00 MiB (GPU 0; 11.17 GiB total capacity; 9.70 GiB already allocated; 179.81 MiB free; 9.85 GiB reserved in total by PyTorch) +Here are some potential solutions you can try to lessen memory use: + +Reduce the per_device_train_batch_size value in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_17.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..e695ef98f192ec9b1ede7fec17ebd6bc4aef4fa7 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_17.txt @@ -0,0 +1 @@ +Try using gradient_accumulation_steps in [TrainingArguments] to effectively increase overall batch size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_18.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8112dcafc541d9971d76032a68753e448ae47a85 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_18.txt @@ -0,0 +1 @@ +Refer to the Performance guide for more details about memory-saving techniques. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_19.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe6f923be60ee1f8634574562078fe5ccc908d5f --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_19.txt @@ -0,0 +1,2 @@ +Unable to load a saved TensorFlow model +TensorFlow's model.save method will save the entire model - architecture, weights, training configuration - in a single file. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_20.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee06473d50172eaab014a2a595b6065143c8eb09 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_20.txt @@ -0,0 +1 @@ +However, when you load the model file again, you may run into an error because 🤗 Transformers may not load all the TensorFlow-related objects in the model file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_21.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d539364302038e2d9a1786ae22e33d83dde822cd --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_21.txt @@ -0,0 +1,23 @@ +To avoid issues with saving and loading TensorFlow models, we recommend you: + +Save the model weights as a h5 file extension with model.save_weights and then reload the model with [~TFPreTrainedModel.from_pretrained]: + +from transformers import TFPreTrainedModel +from tensorflow import keras +model.save_weights("some_folder/tf_model.h5") +model = TFPreTrainedModel.from_pretrained("some_folder") + +Save the model with [~TFPretrainedModel.save_pretrained] and load it again with [~TFPreTrainedModel.from_pretrained]: + +from transformers import TFPreTrainedModel +model.save_pretrained("path_to/model") +model = TFPreTrainedModel.from_pretrained("path_to/model") + +ImportError +Another common error you may encounter, especially if it is a newly released model, is ImportError: +ImportError: cannot import name 'ImageGPTImageProcessor' from 'transformers' (unknown location) +For these error types, check to make sure you have the latest version of 🤗 Transformers installed to access the most recent models: + +pip install transformers --upgrade +CUDA error: device-side assert triggered +Sometimes you may run into a generic CUDA error about an error in the device code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_22.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0273a7be6467146266fd62ceece0958787b76ec --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_22.txt @@ -0,0 +1,2 @@ +RuntimeError: CUDA error: device-side assert triggered +You should try to run the code on a CPU first to get a more descriptive error message. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_23.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..7750b2ce0df337b4b032622d76f2cdc56ecaab6f --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_23.txt @@ -0,0 +1,6 @@ +Add the following environment variable to the beginning of your code to switch to a CPU: + +import os +os.environ["CUDA_VISIBLE_DEVICES"] = "" + +Another option is to get a better traceback from the GPU. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_24.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa80673e7f0f8717bbf199d283c753cb7a324fa5 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_24.txt @@ -0,0 +1,7 @@ +Add the following environment variable to the beginning of your code to get the traceback to point to the source of the error: + +import os +os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + +Incorrect output when padding tokens aren't masked +In some cases, the output hidden_state may be incorrect if the input_ids include padding tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_25.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b49c8016cd9c585c06b0d4654ebdeb9aae4ba4e4 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_25.txt @@ -0,0 +1 @@ +To demonstrate, load a model and tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_26.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2f8dadc79d6161b1bb5fec21817a0d26d3f0c81 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_26.txt @@ -0,0 +1 @@ +You can access a model's pad_token_id to see its value. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_27.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c439c94e88a58b6e11346b50271c7076049ae20 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_27.txt @@ -0,0 +1 @@ +The pad_token_id may be None for some models, but you can always manually set it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_28.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad61a144486cfc50d52edd9b520d74c1b4e00551 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_28.txt @@ -0,0 +1,22 @@ +from transformers import AutoModelForSequenceClassification +import torch +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased") +model.config.pad_token_id +0 + +The following example shows the output without masking the padding tokens: + +input_ids = torch.tensor([[7592, 2057, 2097, 2393, 9611, 2115], [7592, 0, 0, 0, 0, 0]]) +output = model(input_ids) +print(output.logits) +tensor([[ 0.0082, -0.2307], + [ 0.1317, -0.1683]], grad_fn=) + +Here is the actual output of the second sequence: + +input_ids = torch.tensor([[7592]]) +output = model(input_ids) +print(output.logits) +tensor([[-0.1008, -0.4061]], grad_fn=) + +Most of the time, you should provide an attention_mask to your model to ignore the padding tokens to avoid this silent error. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_29.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdc791ee9df438f4ac4a289071473f6fd620bd10 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_29.txt @@ -0,0 +1,3 @@ +Now the output of the second sequence matches its actual output: + +By default, the tokenizer creates an attention_mask for you based on your specific tokenizer's defaults. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_30.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..dad529aff58d9260f188264ab8bf3c5f03d27a1b --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_30.txt @@ -0,0 +1,9 @@ +attention_mask = torch.tensor([[1, 1, 1, 1, 1, 1], [1, 0, 0, 0, 0, 0]]) +output = model(input_ids, attention_mask=attention_mask) +print(output.logits) +tensor([[ 0.0082, -0.2307], + [-0.1008, -0.4061]], grad_fn=) + +🤗 Transformers doesn't automatically create an attention_mask to mask a padding token if it is provided because: + +Some models don't have a padding token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_31.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..41fb44335bf9ac5cd01dd3fa25759eed6c40b7d1 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_31.txt @@ -0,0 +1 @@ +For some use-cases, users want a model to attend to a padding token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_32.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..c02536ed2a864d8c50f5512106d8e886ef72b95b --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_32.txt @@ -0,0 +1,2 @@ +ValueError: Unrecognized configuration class XYZ for this kind of AutoModel +Generally, we recommend using the [AutoModel] class to load pretrained instances of models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_33.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..478a19cd6792544685dbd91b2320061850a9652a --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_33.txt @@ -0,0 +1,2 @@ +This class +can automatically infer and load the correct architecture from a given checkpoint based on the configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_34.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..02b480d439e3b3b3aadaf662d55817e45186f668 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_34.txt @@ -0,0 +1,3 @@ +If you see +this ValueError when loading a model from a checkpoint, this means the Auto class couldn't find a mapping from +the configuration in the given checkpoint to the kind of model you are trying to load. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_35.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea8221aa38e707a9201685d3a4b4f8ecea72299d --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_35.txt @@ -0,0 +1,2 @@ +Most commonly, this happens when a +checkpoint doesn't support a given task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_36.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b97898b8e29ee87cb2c82283be4e7f255a32f96 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_36.txt @@ -0,0 +1,6 @@ +For instance, you'll see this error in the following example because there is no GPT2 for question answering: + +from transformers import AutoProcessor, AutoModelForQuestionAnswering +processor = AutoProcessor.from_pretrained("openai-community/gpt2-medium") +model = AutoModelForQuestionAnswering.from_pretrained("openai-community/gpt2-medium") +ValueError: Unrecognized configuration class for this kind of AutoModel: AutoModelForQuestionAnswering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/_troubleshooting/chunk_37.txt b/chunked/content_aware_chunking/_troubleshooting/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..413745cf04124eeb887e7921ebce79f25da61343 --- /dev/null +++ b/chunked/content_aware_chunking/_troubleshooting/chunk_37.txt @@ -0,0 +1 @@ +Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_audio_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_audio_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..a075d672170fa11d0a76d81990f0df0b06289f02 --- /dev/null +++ b/chunked/content_aware_chunking/internal_audio_utils/chunk_2.txt @@ -0,0 +1,9 @@ +Audio Transformations +[[autodoc]] audio_utils.hertz_to_mel +[[autodoc]] audio_utils.mel_to_hertz +[[autodoc]] audio_utils.mel_filter_bank +[[autodoc]] audio_utils.optimal_fft_length +[[autodoc]] audio_utils.window_function +[[autodoc]] audio_utils.spectrogram +[[autodoc]] audio_utils.power_to_db +[[autodoc]] audio_utils.amplitude_to_db \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_file_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_file_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..276fcc4c6f40fb358e1d0acb17ea95f9a4a31832 --- /dev/null +++ b/chunked/content_aware_chunking/internal_file_utils/chunk_2.txt @@ -0,0 +1,14 @@ +Enums and namedtuples +[[autodoc]] utils.ExplicitEnum +[[autodoc]] utils.PaddingStrategy +[[autodoc]] utils.TensorType +Special Decorators +[[autodoc]] utils.add_start_docstrings +[[autodoc]] utils.add_start_docstrings_to_model_forward +[[autodoc]] utils.add_end_docstrings +[[autodoc]] utils.add_code_sample_docstrings +[[autodoc]] utils.replace_return_docstrings +Special Properties +[[autodoc]] utils.cached_property +Other Utilities +[[autodoc]] utils._LazyModule \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_10.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_10.txt new file mode 100644 index 
0000000000000000000000000000000000000000..e6c77a2ae0e942fd4e3a034a9c66327c953ee415 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_10.txt @@ -0,0 +1 @@ +Here, for instance, it has two keys that are sequences and scores. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_11.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb9f48a41ed6c4f2182692d1ce4b366facf71a59 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_11.txt @@ -0,0 +1 @@ +We document here all output types. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_12.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..66575ab0767965630428b1e36ef8245e0c1879d0 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_12.txt @@ -0,0 +1,23 @@ +PyTorch +[[autodoc]] generation.GenerateDecoderOnlyOutput +[[autodoc]] generation.GenerateEncoderDecoderOutput +[[autodoc]] generation.GenerateBeamDecoderOnlyOutput +[[autodoc]] generation.GenerateBeamEncoderDecoderOutput +TensorFlow +[[autodoc]] generation.TFGreedySearchEncoderDecoderOutput +[[autodoc]] generation.TFGreedySearchDecoderOnlyOutput +[[autodoc]] generation.TFSampleEncoderDecoderOutput +[[autodoc]] generation.TFSampleDecoderOnlyOutput +[[autodoc]] generation.TFBeamSearchEncoderDecoderOutput +[[autodoc]] generation.TFBeamSearchDecoderOnlyOutput +[[autodoc]] generation.TFBeamSampleEncoderDecoderOutput +[[autodoc]] generation.TFBeamSampleDecoderOnlyOutput +[[autodoc]] generation.TFContrastiveSearchEncoderDecoderOutput +[[autodoc]] generation.TFContrastiveSearchDecoderOnlyOutput +FLAX +[[autodoc]] generation.FlaxSampleOutput +[[autodoc]] generation.FlaxGreedySearchOutput +[[autodoc]] generation.FlaxBeamSearchOutput +LogitsProcessor +A [LogitsProcessor] can be used to modify the prediction scores of a language model head for +generation. 
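For instance, a minimal sketch (assuming an openai-community/gpt2 checkpoint) of passing a logits processor to generate():

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
inputs = tokenizer("Today is", return_tensors="pt")

# Prevent the EOS token from being selected before 10 tokens have been generated.
logits_processor = LogitsProcessorList(
    [MinLengthLogitsProcessor(10, eos_token_id=model.config.eos_token_id)]
)
outputs = model.generate(**inputs, logits_processor=logits_processor)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))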
\ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_13.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e7ab9dca306d9791199eb24f68b738d2b30fbff --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_13.txt @@ -0,0 +1,123 @@ +PyTorch +[[autodoc]] AlternatingCodebooksLogitsProcessor + - call +[[autodoc]] ClassifierFreeGuidanceLogitsProcessor + - call +[[autodoc]] EncoderNoRepeatNGramLogitsProcessor + - call +[[autodoc]] EncoderRepetitionPenaltyLogitsProcessor + - call +[[autodoc]] EpsilonLogitsWarper + - call +[[autodoc]] EtaLogitsWarper + - call +[[autodoc]] ExponentialDecayLengthPenalty + - call +[[autodoc]] ForcedBOSTokenLogitsProcessor + - call +[[autodoc]] ForcedEOSTokenLogitsProcessor + - call +[[autodoc]] ForceTokensLogitsProcessor + - call +[[autodoc]] HammingDiversityLogitsProcessor + - call +[[autodoc]] InfNanRemoveLogitsProcessor + - call +[[autodoc]] LogitNormalization + - call +[[autodoc]] LogitsProcessor + - call +[[autodoc]] LogitsProcessorList + - call +[[autodoc]] LogitsWarper + - call +[[autodoc]] MinLengthLogitsProcessor + - call +[[autodoc]] MinNewTokensLengthLogitsProcessor + - call +[[autodoc]] NoBadWordsLogitsProcessor + - call +[[autodoc]] NoRepeatNGramLogitsProcessor + - call +[[autodoc]] PrefixConstrainedLogitsProcessor + - call +[[autodoc]] RepetitionPenaltyLogitsProcessor + - call +[[autodoc]] SequenceBiasLogitsProcessor + - call +[[autodoc]] SuppressTokensAtBeginLogitsProcessor + - call +[[autodoc]] SuppressTokensLogitsProcessor + - call +[[autodoc]] TemperatureLogitsWarper + - call +[[autodoc]] TopKLogitsWarper + - call +[[autodoc]] TopPLogitsWarper + - call +[[autodoc]] TypicalLogitsWarper + - call +[[autodoc]] UnbatchedClassifierFreeGuidanceLogitsProcessor + - call +[[autodoc]] WhisperTimeStampLogitsProcessor + - call +TensorFlow +[[autodoc]] TFForcedBOSTokenLogitsProcessor + - call +[[autodoc]] TFForcedEOSTokenLogitsProcessor + - call +[[autodoc]] TFForceTokensLogitsProcessor + - call +[[autodoc]] TFLogitsProcessor + - call +[[autodoc]] TFLogitsProcessorList + - call +[[autodoc]] TFLogitsWarper + - call +[[autodoc]] TFMinLengthLogitsProcessor + - call +[[autodoc]] TFNoBadWordsLogitsProcessor + - call +[[autodoc]] TFNoRepeatNGramLogitsProcessor + - call +[[autodoc]] TFRepetitionPenaltyLogitsProcessor + - call +[[autodoc]] TFSuppressTokensAtBeginLogitsProcessor + - call +[[autodoc]] TFSuppressTokensLogitsProcessor + - call +[[autodoc]] TFTemperatureLogitsWarper + - call +[[autodoc]] TFTopKLogitsWarper + - call +[[autodoc]] TFTopPLogitsWarper + - call +FLAX +[[autodoc]] FlaxForcedBOSTokenLogitsProcessor + - call +[[autodoc]] FlaxForcedEOSTokenLogitsProcessor + - call +[[autodoc]] FlaxForceTokensLogitsProcessor + - call +[[autodoc]] FlaxLogitsProcessor + - call +[[autodoc]] FlaxLogitsProcessorList + - call +[[autodoc]] FlaxLogitsWarper + - call +[[autodoc]] FlaxMinLengthLogitsProcessor + - call +[[autodoc]] FlaxSuppressTokensAtBeginLogitsProcessor + - call +[[autodoc]] FlaxSuppressTokensLogitsProcessor + - call +[[autodoc]] FlaxTemperatureLogitsWarper + - call +[[autodoc]] FlaxTopKLogitsWarper + - call +[[autodoc]] FlaxTopPLogitsWarper + - call +[[autodoc]] FlaxWhisperTimeStampLogitsProcessor + - call +StoppingCriteria +A [StoppingCriteria] can be used to change when to stop generation (other than EOS token). 
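For instance, a minimal PyTorch sketch (again assuming an openai-community/gpt2 checkpoint) of passing a stopping criterion to generate():

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    MaxTimeCriteria,
    StoppingCriteriaList,
)

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
inputs = tokenizer("Once upon a time", return_tensors="pt")

# Stop generating after roughly 5 seconds, on top of the usual length limits.
stopping_criteria = StoppingCriteriaList([MaxTimeCriteria(max_time=5.0)])
outputs = model.generate(**inputs, stopping_criteria=stopping_criteria, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))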
\ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_14.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fd95b827464e604b4a0bb368e464f87f124121a --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_14.txt @@ -0,0 +1 @@ +Please note that this is exclusively available to our PyTorch implementations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_15.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..08c868c1f2ac3976eff5103034e3b113a3e3a8e6 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_15.txt @@ -0,0 +1,10 @@ +[[autodoc]] StoppingCriteria + - call +[[autodoc]] StoppingCriteriaList + - call +[[autodoc]] MaxLengthCriteria + - call +[[autodoc]] MaxTimeCriteria + - call +Constraints +A [Constraint] can be used to force the generation to include specific tokens or sequences in the output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_16.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fd95b827464e604b4a0bb368e464f87f124121a --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_16.txt @@ -0,0 +1 @@ +Please note that this is exclusively available to our PyTorch implementations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_17.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1bbd5680aa709a64cb2a641a058a406f57c7692 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_17.txt @@ -0,0 +1,36 @@ +[[autodoc]] Constraint +[[autodoc]] PhrasalConstraint +[[autodoc]] DisjunctiveConstraint +[[autodoc]] ConstraintListState +BeamSearch +[[autodoc]] BeamScorer + - process + - finalize +[[autodoc]] BeamSearchScorer + - process + - finalize +[[autodoc]] ConstrainedBeamSearchScorer + - process + - finalize +Utilities +[[autodoc]] top_k_top_p_filtering +[[autodoc]] tf_top_k_top_p_filtering +Streamers +[[autodoc]] TextStreamer +[[autodoc]] TextIteratorStreamer +Caches +[[autodoc]] Cache + - update +[[autodoc]] DynamicCache + - update + - get_seq_length + - reorder_cache + - to_legacy_cache + - from_legacy_cache +[[autodoc]] SinkCache + - update + - get_seq_length + - reorder_cache +[[autodoc]] StaticCache + - update + - get_seq_length \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_7.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..88291563c0fd3b26df6f10bad64ce71374f7efa0 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_7.txt @@ -0,0 +1 @@ +When using our generation_output object as a tuple, it only keeps the attributes that don't have None values. 
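A minimal sketch (assuming an openai-community/gpt2 checkpoint) of how a generation_output object like the one discussed here can be produced:

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
inputs = tokenizer("Hello, my dog is cute and", return_tensors="pt")

# Request a structured output instead of a bare tensor of token ids.
generation_output = model.generate(
    **inputs, return_dict_in_generate=True, output_scores=True
)
print(generation_output.sequences.shape)  # generated token ids
print(len(generation_output.scores))      # one score tensor per generated token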
\ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_8.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4308ef954ca1a3ac37ad57bb4322cd35ce7d3ad0 --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_8.txt @@ -0,0 +1,4 @@ +Here, for instance, it has two elements, loss then logits, so +python +generation_output[:2] +will return the tuple (generation_output.sequences, generation_output.scores) for instance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_generation_utils/chunk_9.txt b/chunked/content_aware_chunking/internal_generation_utils/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..430aa58713ea94719a2bfe61b55573c94769518f --- /dev/null +++ b/chunked/content_aware_chunking/internal_generation_utils/chunk_9.txt @@ -0,0 +1,2 @@ +When using our generation_output object as a dictionary, it only keeps the attributes that don't have None +values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_image_processing_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_image_processing_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..56cd1b2c7cee6c2bee6205724797b1101911d244 --- /dev/null +++ b/chunked/content_aware_chunking/internal_image_processing_utils/chunk_2.txt @@ -0,0 +1,13 @@ +Image Transformations +[[autodoc]] image_transforms.center_crop +[[autodoc]] image_transforms.center_to_corners_format +[[autodoc]] image_transforms.corners_to_center_format +[[autodoc]] image_transforms.id_to_rgb +[[autodoc]] image_transforms.normalize +[[autodoc]] image_transforms.pad +[[autodoc]] image_transforms.rgb_to_id +[[autodoc]] image_transforms.rescale +[[autodoc]] image_transforms.resize +[[autodoc]] image_transforms.to_pil_image +ImageProcessingMixin +[[autodoc]] image_processing_utils.ImageProcessingMixin \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_modeling_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_modeling_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..76c69d6e97a4ee39c90f6b726eb353859ca2e0b5 --- /dev/null +++ b/chunked/content_aware_chunking/internal_modeling_utils/chunk_2.txt @@ -0,0 +1,33 @@ +Pytorch custom modules +[[autodoc]] pytorch_utils.Conv1D +[[autodoc]] modeling_utils.PoolerStartLogits + - forward +[[autodoc]] modeling_utils.PoolerEndLogits + - forward +[[autodoc]] modeling_utils.PoolerAnswerClass + - forward +[[autodoc]] modeling_utils.SquadHeadOutput +[[autodoc]] modeling_utils.SQuADHead + - forward +[[autodoc]] modeling_utils.SequenceSummary + - forward +PyTorch Helper Functions +[[autodoc]] pytorch_utils.apply_chunking_to_forward +[[autodoc]] pytorch_utils.find_pruneable_heads_and_indices +[[autodoc]] pytorch_utils.prune_layer +[[autodoc]] pytorch_utils.prune_conv1d_layer +[[autodoc]] pytorch_utils.prune_linear_layer +TensorFlow custom layers +[[autodoc]] modeling_tf_utils.TFConv1D +[[autodoc]] modeling_tf_utils.TFSequenceSummary +TensorFlow loss functions +[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss +[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss +[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss +[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss +[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss +[[autodoc]] 
modeling_tf_utils.TFTokenClassificationLoss +TensorFlow Helper Functions +[[autodoc]] modeling_tf_utils.get_initializer +[[autodoc]] modeling_tf_utils.keras_serializable +[[autodoc]] modeling_tf_utils.shape_list \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_pipelines_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_pipelines_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8c688edf961a9c28aead7d56a7f8536be52fa71 --- /dev/null +++ b/chunked/content_aware_chunking/internal_pipelines_utils/chunk_2.txt @@ -0,0 +1,11 @@ +Argument handling +[[autodoc]] pipelines.ArgumentHandler +[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler +[[autodoc]] pipelines.QuestionAnsweringArgumentHandler +Data format +[[autodoc]] pipelines.PipelineDataFormat +[[autodoc]] pipelines.CsvPipelineDataFormat +[[autodoc]] pipelines.JsonPipelineDataFormat +[[autodoc]] pipelines.PipedPipelineDataFormat +Utilities +[[autodoc]] pipelines.PipelineException \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_time_series_utils/chunk_1.txt b/chunked/content_aware_chunking/internal_time_series_utils/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1d922b6c78e368f129ea9fd8235af36483a1d54 --- /dev/null +++ b/chunked/content_aware_chunking/internal_time_series_utils/chunk_1.txt @@ -0,0 +1 @@ +Most of those are only useful if you are studying the code of the time series models or you wish to add to the collection of distributional output classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_time_series_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_time_series_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..adfd810e66d450d9b4030ded7e8fddb5951f476c --- /dev/null +++ b/chunked/content_aware_chunking/internal_time_series_utils/chunk_2.txt @@ -0,0 +1,4 @@ +Distributional Output +[[autodoc]] time_series_utils.NormalOutput +[[autodoc]] time_series_utils.StudentTOutput +[[autodoc]] time_series_utils.NegativeBinomialOutput \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_tokenization_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_tokenization_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..15ad0f1df8584ba5bce64bbf2e2bf2740da1b19e --- /dev/null +++ b/chunked/content_aware_chunking/internal_tokenization_utils/chunk_2.txt @@ -0,0 +1,10 @@ +PreTrainedTokenizerBase +[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase + - call + - all +SpecialTokensMixin +[[autodoc]] tokenization_utils_base.SpecialTokensMixin +Enums and namedtuples +[[autodoc]] tokenization_utils_base.TruncationStrategy +[[autodoc]] tokenization_utils_base.CharSpan +[[autodoc]] tokenization_utils_base.TokenSpan \ No newline at end of file diff --git a/chunked/content_aware_chunking/internal_trainer_utils/chunk_2.txt b/chunked/content_aware_chunking/internal_trainer_utils/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..21e042c7122ba2bc484e4f644a67a3aec95fe8b6 --- /dev/null +++ b/chunked/content_aware_chunking/internal_trainer_utils/chunk_2.txt @@ -0,0 +1,14 @@ +Utilities +[[autodoc]] EvalPrediction +[[autodoc]] IntervalStrategy +[[autodoc]] enable_full_determinism +[[autodoc]] set_seed +[[autodoc]] torch_distributed_zero_first +Callbacks internals +[[autodoc]] trainer_callback.CallbackHandler +Distributed Evaluation 
+[[autodoc]] trainer_pt_utils.DistributedTensorGatherer +Trainer Argument Parser +[[autodoc]] HfArgumentParser +Debug Utilities +[[autodoc]] debug_utils.DebugUnderflowOverflow \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_3.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcf5fa6b6a69dbb61eb4394373eb296604ee8021 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_3.txt @@ -0,0 +1,2 @@ +This page +contains the API docs for the underlying classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_4.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f0878579d763415a751c8adf7079f4702e9b708 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_4.txt @@ -0,0 +1,2 @@ +Agents +We provide three types of agents: [HfAgent] uses inference endpoints for opensource models, [LocalAgent] uses a model of your choice locally and [OpenAiAgent] uses OpenAI closed models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_5.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdce0b175640d86c607de83832e12f81765cead3 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_5.txt @@ -0,0 +1,27 @@ +HfAgent +[[autodoc]] HfAgent +LocalAgent +[[autodoc]] LocalAgent +OpenAiAgent +[[autodoc]] OpenAiAgent +AzureOpenAiAgent +[[autodoc]] AzureOpenAiAgent +Agent +[[autodoc]] Agent + - chat + - run + - prepare_for_new_chat +Tools +load_tool +[[autodoc]] load_tool +Tool +[[autodoc]] Tool +PipelineTool +[[autodoc]] PipelineTool +RemoteTool +[[autodoc]] RemoteTool +launch_gradio_demo +[[autodoc]] launch_gradio_demo +Agent Types +Agents can handle any type of object in-between tools; tools, being completely multimodal, can accept and return +text, image, audio, video, among other types. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_6.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac1a4a9c73e47158209bf335edd88c873cf5de8c --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_6.txt @@ -0,0 +1,3 @@ +In order to increase compatibility between tools, as well as to +correctly render these returns in ipython (jupyter, colab, ipython notebooks, ), we implement wrapper classes +around these types. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_7.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..9252263e365e9320948a8dbd9ec99462738c93f8 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_7.txt @@ -0,0 +1,2 @@ +The wrapped objects should continue behaving as initially; a text object should still behave as a string, an image +object should still behave as a PIL.Image. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_agent/chunk_8.txt b/chunked/content_aware_chunking/main_classes_agent/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cef85777005f6c0d869bf801a2768dab88ec7529 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_agent/chunk_8.txt @@ -0,0 +1,13 @@ +These types have three specific purposes: + +Calling to_raw on the type should return the underlying object +Calling to_string on the type should return the object as a string: that can be the string in case of an AgentText + but will be the path of the serialized version of the object in other instances +Displaying it in an ipython kernel should display the object correctly + +AgentText +[[autodoc]] transformers.tools.agent_types.AgentText +AgentImage +[[autodoc]] transformers.tools.agent_types.AgentImage +AgentAudio +[[autodoc]] transformers.tools.agent_types.AgentAudio \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_backbones/chunk_2.txt b/chunked/content_aware_chunking/main_classes_backbones/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..40b069c52168cd6d8306c9c8ce5a338d842f4358 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_backbones/chunk_2.txt @@ -0,0 +1 @@ +[~utils.BackboneConfigMixin] sets the output features and indices of the backbone configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_backbones/chunk_3.txt b/chunked/content_aware_chunking/main_classes_backbones/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..21717f64774f0167ed700cbf2fbb4048d13dd404 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_backbones/chunk_3.txt @@ -0,0 +1 @@ +timm models are loaded with the [TimmBackbone] and [TimmBackboneConfig] classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_backbones/chunk_4.txt b/chunked/content_aware_chunking/main_classes_backbones/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c12f59e4fadea484e18522f5491e17d2e6398a5 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_backbones/chunk_4.txt @@ -0,0 +1,26 @@ +Backbones are supported for the following models: + +BEiT +BiT +ConvNet +ConvNextV2 +DiNAT +DINOV2 +FocalNet +MaskFormer +NAT +ResNet +Swin Transformer +Swin Transformer v2 +ViTDet + +AutoBackbone +[[autodoc]] AutoBackbone +BackboneMixin +[[autodoc]] utils.BackboneMixin +BackboneConfigMixin +[[autodoc]] utils.BackboneConfigMixin +TimmBackbone +[[autodoc]] models.timm_backbone.TimmBackbone +TimmBackboneConfig +[[autodoc]] models.timm_backbone.TimmBackboneConfig \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_10.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ee40f35883b5185fe4ff52d3ada0c441f652e1f --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_10.txt @@ -0,0 +1 @@ +[~integrations.NeptuneCallback] if neptune is installed. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_11.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed24c52ec0910c617263c04c464ccc0017baa732 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_11.txt @@ -0,0 +1,2 @@ +[~integrations.AzureMLCallback] if azureml-sdk is + installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_12.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6a250344b9f4fb82114942d7fb76b816dd5dc14 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_12.txt @@ -0,0 +1,2 @@ +[~integrations.CodeCarbonCallback] if codecarbon is + installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_13.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8508c4e3dc77a9cda8feec946836512047c213b5 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_13.txt @@ -0,0 +1 @@ +[~integrations.ClearMLCallback] if clearml is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_14.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a215b46be1d5a92f6ff9c22e2d9fff939d010db6 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_14.txt @@ -0,0 +1 @@ +[~integrations.DagsHubCallback] if dagshub is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_15.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f3ab061f19412f413f0869a2c0b4e6ff5b1b0a2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_15.txt @@ -0,0 +1 @@ +[~integrations.FlyteCallback] if flyte is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_16.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..14da9e7d213727012654acf294864f95c9f9556c --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_16.txt @@ -0,0 +1 @@ +[~integrations.DVCLiveCallback] if dvclive is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_17.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e5a282b7bd694a4dec216cb045c760086d7e8e2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_17.txt @@ -0,0 +1 @@ +If a package is installed but you don't wish to use the accompanying integration, you can change TrainingArguments.report_to to a list of just those integrations you want to use (e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_18.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..12e86fe9613271c87786322f318cb725569cf5db --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_18.txt @@ -0,0 +1 @@ +["azure_ml", "wandb"]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_19.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..f917f78616620cba47f346b6fff98f4279e19961 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_19.txt @@ -0,0 +1 @@ +The main class that implements callbacks is [TrainerCallback]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_20.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2ab28e33c30819c3875802327d79a659d3099f3 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_20.txt @@ -0,0 +1,4 @@ +It gets the +[TrainingArguments] used to instantiate the [Trainer], can access that +Trainer's internal state via [TrainerState], and can take some actions on the training loop via +[TrainerControl]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_21.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a20a687f3c39d0345dfe9ad77dc89fef4a41ced --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_21.txt @@ -0,0 +1,49 @@ +Available Callbacks +Here is the list of the available [TrainerCallback] in the library: +[[autodoc]] integrations.CometCallback + - setup +[[autodoc]] DefaultFlowCallback +[[autodoc]] PrinterCallback +[[autodoc]] ProgressCallback +[[autodoc]] EarlyStoppingCallback +[[autodoc]] integrations.TensorBoardCallback +[[autodoc]] integrations.WandbCallback + - setup +[[autodoc]] integrations.MLflowCallback + - setup +[[autodoc]] integrations.AzureMLCallback +[[autodoc]] integrations.CodeCarbonCallback +[[autodoc]] integrations.NeptuneCallback +[[autodoc]] integrations.ClearMLCallback +[[autodoc]] integrations.DagsHubCallback +[[autodoc]] integrations.FlyteCallback +[[autodoc]] integrations.DVCLiveCallback + - setup +TrainerCallback +[[autodoc]] TrainerCallback +Here is an example of how to register a custom callback with the PyTorch [Trainer]: +thon +class MyCallback(TrainerCallback): + "A callback that prints a message at the beginning of training" +def on_train_begin(self, args, state, control, **kwargs): + print("Starting training") + +trainer = Trainer( + model, + args, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + callbacks=[MyCallback], # We can either pass the callback class this way or an instance of it (MyCallback()) +) + +Another way to register a callback is to call trainer.add_callback() as follows: +thon +trainer = Trainer() +trainer.add_callback(MyCallback) +Alternatively, we can pass an instance of the callback class +trainer.add_callback(MyCallback()) + +TrainerState +[[autodoc]] TrainerState +TrainerControl +[[autodoc]] TrainerControl \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_4.txt 
b/chunked/content_aware_chunking/main_classes_callback/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4c792da90de009a42990fe446b92b42b1f1b8a7 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_4.txt @@ -0,0 +1 @@ +[DefaultFlowCallback] which handles the default behavior for logging, saving and evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_5.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbaee06ab6f23634e6432f29e9049f7ea35b2f66 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_5.txt @@ -0,0 +1,3 @@ +[PrinterCallback] or [ProgressCallback] to display progress and print the + logs (the first one is used if you deactivate tqdm through the [TrainingArguments], otherwise + it's the second one). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_6.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3605c93fe261b78020c225f1d568b5f5b7e6d76d --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_6.txt @@ -0,0 +1,2 @@ +[~integrations.TensorBoardCallback] if tensorboard is accessible (either through PyTorch >= 1.4 + or tensorboardX). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_7.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cfdde642cc068b901ec36256cf21c2fd2046c0d --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_7.txt @@ -0,0 +1 @@ +[~integrations.WandbCallback] if wandb is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_8.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5957fa8bf9d5ffaddeaee336995d55edd638f5d --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_8.txt @@ -0,0 +1 @@ +[~integrations.CometCallback] if comet_ml is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_callback/chunk_9.txt b/chunked/content_aware_chunking/main_classes_callback/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..0748871973fe2ea7d75a0411455dfa195e211de5 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_callback/chunk_9.txt @@ -0,0 +1 @@ +[~integrations.MLflowCallback] if mlflow is installed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_configuration/chunk_2.txt b/chunked/content_aware_chunking/main_classes_configuration/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f0961ef2457e8da232e0149cf9346d1b79684c2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_configuration/chunk_2.txt @@ -0,0 +1,2 @@ +Common attributes present in all config classes are: +hidden_size, num_attention_heads, and num_hidden_layers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_configuration/chunk_3.txt b/chunked/content_aware_chunking/main_classes_configuration/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b7227abef113d1692d24f9fa26b3b48d933f1e --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_configuration/chunk_3.txt @@ -0,0 +1,2 @@ +Text models further implement: +vocab_size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_configuration/chunk_4.txt b/chunked/content_aware_chunking/main_classes_configuration/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..4538e1d776188eee3004a9b124255da090a3160d --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_configuration/chunk_4.txt @@ -0,0 +1,4 @@ +PretrainedConfig +[[autodoc]] PretrainedConfig + - push_to_hub + - all \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_data_collator/chunk_2.txt b/chunked/content_aware_chunking/main_classes_data_collator/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec240192b5cede45ea01c96567cf157ed7863335 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_data_collator/chunk_2.txt @@ -0,0 +1 @@ +To be able to build batches, data collators may apply some processing (like padding). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_data_collator/chunk_3.txt b/chunked/content_aware_chunking/main_classes_data_collator/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..06ae9e4956200462273926905176d75eb60120b2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_data_collator/chunk_3.txt @@ -0,0 +1,3 @@ +Some of them (like +[DataCollatorForLanguageModeling]) also apply some random data augmentation (like random masking) +on the formed batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_data_collator/chunk_4.txt b/chunked/content_aware_chunking/main_classes_data_collator/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c6a392201806c774cf389737fceac2ca25d6b5d --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_data_collator/chunk_4.txt @@ -0,0 +1 @@ +Examples of use can be found in the example scripts or example notebooks. 
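A minimal sketch of dynamic padding with DataCollatorWithPadding, assuming a google-bert/bert-base-uncased tokenizer:

from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Two tokenized examples of different lengths are padded into a single batch.
features = [
    tokenizer("Short sentence."),
    tokenizer("A somewhat longer sentence than the first one."),
]
batch = data_collator(features)
print(batch["input_ids"].shape)       # (2, longest_sequence_in_batch)
print(batch["attention_mask"].shape)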
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_data_collator/chunk_5.txt b/chunked/content_aware_chunking/main_classes_data_collator/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..af53d2f769858d50ee53893934e284d010e7a26f --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_data_collator/chunk_5.txt @@ -0,0 +1,25 @@ +Default data collator +[[autodoc]] data.data_collator.default_data_collator +DefaultDataCollator +[[autodoc]] data.data_collator.DefaultDataCollator +DataCollatorWithPadding +[[autodoc]] data.data_collator.DataCollatorWithPadding +DataCollatorForTokenClassification +[[autodoc]] data.data_collator.DataCollatorForTokenClassification +DataCollatorForSeq2Seq +[[autodoc]] data.data_collator.DataCollatorForSeq2Seq +DataCollatorForLanguageModeling +[[autodoc]] data.data_collator.DataCollatorForLanguageModeling + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens +DataCollatorForWholeWordMask +[[autodoc]] data.data_collator.DataCollatorForWholeWordMask + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens +DataCollatorForPermutationLanguageModeling +[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling + - numpy_mask_tokens + - tf_mask_tokens + - torch_mask_tokens \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_deepspeed/chunk_2.txt b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d84a2b6c42478e44f27c72726723d89d6490229 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_2.txt @@ -0,0 +1 @@ +DeepSpeed is integrated with the [Trainer] class and most of the setup is automatically taken care of for you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_deepspeed/chunk_3.txt b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbd5f69a01e25f58bd9dc820b90d6b15eed41898 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_3.txt @@ -0,0 +1 @@ +However, if you want to use DeepSpeed without the [Trainer], Transformers provides a [HfDeepSpeedConfig] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_deepspeed/chunk_4.txt b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..797f64e2d334d9fed4b1ed251af1fe1957c3b74b --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_4.txt @@ -0,0 +1 @@ +Learn more about using DeepSpeed with [Trainer] in the DeepSpeed guide. 
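A minimal sketch (not a complete training script) of using [HfDeepSpeedConfig] without the [Trainer]; the config values below are illustrative assumptions:

from transformers import AutoModel
from transformers.integrations import HfDeepSpeedConfig

ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {"stage": 3},
}

# Create the HfDeepSpeedConfig *before* instantiating the model and keep it alive,
# so that from_pretrained can detect ZeRO-3 and partition the weights while loading.
dschf = HfDeepSpeedConfig(ds_config)

model = AutoModel.from_pretrained("openai-community/gpt2")
# The model and ds_config would then be handed to deepspeed.initialize(...).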
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_deepspeed/chunk_5.txt b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ac597c8a53eea504ee6ff4c72f66d20b5eb3328 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_deepspeed/chunk_5.txt @@ -0,0 +1,3 @@ +HfDeepSpeedConfig +[[autodoc]] integrations.HfDeepSpeedConfig + - all \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_feature_extractor/chunk_2.txt b/chunked/content_aware_chunking/main_classes_feature_extractor/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cc1505f466e832a69ac22eb5637d06cd46cef56 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_feature_extractor/chunk_2.txt @@ -0,0 +1,11 @@ +FeatureExtractionMixin +[[autodoc]] feature_extraction_utils.FeatureExtractionMixin + - from_pretrained + - save_pretrained +SequenceFeatureExtractor +[[autodoc]] SequenceFeatureExtractor + - pad +BatchFeature +[[autodoc]] BatchFeature +ImageFeatureExtractionMixin +[[autodoc]] image_utils.ImageFeatureExtractionMixin \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_image_processor/chunk_2.txt b/chunked/content_aware_chunking/main_classes_image_processor/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..a438d125de76a00da09e53a8fa074d46d183096b --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_image_processor/chunk_2.txt @@ -0,0 +1 @@ +It may also include model specific post-processing such as converting logits to segmentation masks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_image_processor/chunk_3.txt b/chunked/content_aware_chunking/main_classes_image_processor/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..301e40d9f3f00446294d9be9f35b9b24d0d0cac4 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_image_processor/chunk_3.txt @@ -0,0 +1,8 @@ +ImageProcessingMixin +[[autodoc]] image_processing_utils.ImageProcessingMixin + - from_pretrained + - save_pretrained +BatchFeature +[[autodoc]] BatchFeature +BaseImageProcessor +[[autodoc]] image_processing_utils.BaseImageProcessor \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_10.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b387f0b27a20b843ed0ab5d6384f4745883a5e2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_10.txt @@ -0,0 +1 @@ +transformers.logging.ERROR (int value, 40): only report errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_11.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..26a6659ae6f2964a2fc26bd762be79e424ea5021 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_11.txt @@ -0,0 +1,2 @@ +transformers.logging.WARNING or transformers.logging.WARN (int value, 30): only reports error and + warnings. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_12.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..29cdb3840e490a04c9957835e5dfef4ed808e89a --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_12.txt @@ -0,0 +1 @@ +This is the default level used by the library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_13.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..33016c50c0591ec4c2ca1dc84b5a57518c579ff3 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_13.txt @@ -0,0 +1 @@ +transformers.logging.INFO (int value, 20): reports errors, warnings and basic information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_14.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..734419f14bb23dde0eb1dc300d6ddc9d721796f1 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_14.txt @@ -0,0 +1 @@ +transformers.logging.DEBUG (int value, 10): reports all information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_15.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6779c59d45a040b6033bbd1e05a3aee1bfe017ad --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_15.txt @@ -0,0 +1 @@ +By default, tqdm progress bars will be displayed during model download. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_16.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c24193cf9f485289542c7bde345fe83c33fb3995 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_16.txt @@ -0,0 +1 @@ +[logging.disable_progress_bar] and [logging.enable_progress_bar] can be used to suppress or unsuppress this behavior. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_17.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ba091aa7b846fddcbb0b2dda2021365cc0311ee --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_17.txt @@ -0,0 +1,4 @@ +logging vs warnings +Python has two logging systems that are often used in conjunction: logging, which is explained above, and warnings, +which allows further classification of warnings in specific buckets, e.g., FutureWarning for a feature or path +that has already been deprecated and DeprecationWarning to indicate an upcoming deprecation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_18.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b18552a72f12b34c85d3222716c2b6bf0ca834a2 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_18.txt @@ -0,0 +1 @@ +We use both in the transformers library.
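As a short, hedged illustration of the verbosity levels and progress-bar helpers listed just above (only functions named by the autodoc entries in this section are used), the following sketch silences the download bars, restores them, and sets the verbosity back to the library default.

```python
from transformers.utils import logging

# Silence the tqdm bars shown while weights are downloaded...
logging.disable_progress_bar()
# ...call from_pretrained() here without progress output...
# ...and restore the bars afterwards.
logging.enable_progress_bar()

# The integer level constants described above can be passed to set_verbosity;
# WARNING (30) is the library default.
logging.set_verbosity(logging.WARNING)
```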
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_19.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..289b1ad04df38e7152a5479e9164bdd5d86a2884 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_19.txt @@ -0,0 +1,2 @@ +We leverage and adapt logging's captureWarning method to allow +management of these warning messages by the verbosity setters above. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_20.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8b4a111cf705466b8d7f87eeeda24ba07860cc5 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_20.txt @@ -0,0 +1 @@ +What does that mean for developers of the library? \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_21.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc65f5cab1dc26932d0e6f9cd48644b591248669 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_21.txt @@ -0,0 +1,4 @@ +We should respect the following heuristic: +- warnings should be favored for developers of the library and libraries dependent on transformers +- logging should be used for end-users of the library using it in every-day projects +See reference of the captureWarnings method below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_22.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..412df5245797e4c0b97336832f83c0c305c3b5b0 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_22.txt @@ -0,0 +1,16 @@ +[[autodoc]] logging.captureWarnings +Base setters +[[autodoc]] logging.set_verbosity_error +[[autodoc]] logging.set_verbosity_warning +[[autodoc]] logging.set_verbosity_info +[[autodoc]] logging.set_verbosity_debug +Other functions +[[autodoc]] logging.get_verbosity +[[autodoc]] logging.set_verbosity +[[autodoc]] logging.get_logger +[[autodoc]] logging.enable_default_handler +[[autodoc]] logging.disable_default_handler +[[autodoc]] logging.enable_explicit_format +[[autodoc]] logging.reset_format +[[autodoc]] logging.enable_progress_bar +[[autodoc]] logging.disable_progress_bar \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_7.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8824c63744bad009fffc52c3c9f51a833951f68 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_7.txt @@ -0,0 +1,2 @@ +This will disable any warning that is logged using +[logger.warning_advice]. 
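A small sketch of the warnings-capture mechanism described above: it assumes the library's captureWarnings helper takes a boolean, mirroring the standard-library function it adapts, which is an assumption rather than something stated in this section.

```python
import warnings

from transformers.utils import logging

# Assumption: like the stdlib helper it adapts, captureWarnings takes a boolean.
logging.captureWarnings(True)

# From here on, warnings.warn(...) messages are handled by the library's
# logging system and therefore obey the verbosity setters described above.
logging.set_verbosity_error()
warnings.warn("this FutureWarning is now managed by the transformers logger", FutureWarning)
```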
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_8.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8268d446929a099864c7a28ed95effc84ea2037e --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_8.txt @@ -0,0 +1,14 @@ +For example: + +TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py +Here is an example of how to use the same logger as the library in your own module or script: +thon +from transformers.utils import logging +logging.set_verbosity_info() +logger = logging.get_logger("transformers") +logger.info("INFO") +logger.warning("WARN") + +All the methods of this logging module are documented below, the main ones are +[logging.get_verbosity] to get the current level of verbosity in the logger and +[logging.set_verbosity] to set the verbosity to the level of your choice. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_logging/chunk_9.txt b/chunked/content_aware_chunking/main_classes_logging/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e25ca51e90a032b49566b59799f775893776c61 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_logging/chunk_9.txt @@ -0,0 +1,5 @@ +In order (from the least +verbose to the most verbose), those levels (with their corresponding int values in parenthesis) are: + +transformers.logging.CRITICAL or transformers.logging.FATAL (int value, 50): only report the most + critical errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_10.txt b/chunked/content_aware_chunking/main_classes_model/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..420c1994b09ba59eb290279c87c9c4ce219e8052 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_10.txt @@ -0,0 +1 @@ +With device_map="auto", Accelerate will determine where to put each layer to maximize the use of your fastest devices (GPUs) and offload the rest on the CPU, or even the hard drive if you don't have enough GPU RAM (or CPU RAM). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_11.txt b/chunked/content_aware_chunking/main_classes_model/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..35c6c2dcceb0261587fac700b8430b1198ebab10 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_11.txt @@ -0,0 +1 @@ +Even if the model is split across several devices, it will run as you would normally expect. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_12.txt b/chunked/content_aware_chunking/main_classes_model/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b4fd55f2100d0d60fc51b87268c776071d46ed0 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_12.txt @@ -0,0 +1,40 @@ +When passing a device_map, low_cpu_mem_usage is automatically set to True, so you don't need to specify it: + +from transformers import AutoModelForSeq2SeqLM +t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto") + +You can inspect how the model was split across devices by looking at its hf_device_map attribute: +py +t0pp.hf_device_map +python out +{'shared': 0, + 'decoder.embed_tokens': 0, + 'encoder': 0, + 'decoder.block.0': 0, + 'decoder.block.1': 1, + 'decoder.block.2': 1, + 'decoder.block.3': 1, + 'decoder.block.4': 1, + 'decoder.block.5': 1, + 'decoder.block.6': 1, + 'decoder.block.7': 1, + 'decoder.block.8': 1, + 'decoder.block.9': 1, + 'decoder.block.10': 1, + 'decoder.block.11': 1, + 'decoder.block.12': 1, + 'decoder.block.13': 1, + 'decoder.block.14': 1, + 'decoder.block.15': 1, + 'decoder.block.16': 1, + 'decoder.block.17': 1, + 'decoder.block.18': 1, + 'decoder.block.19': 1, + 'decoder.block.20': 1, + 'decoder.block.21': 1, + 'decoder.block.22': 'cpu', + 'decoder.block.23': 'cpu', + 'decoder.final_layer_norm': 'cpu', + 'decoder.dropout': 'cpu', + 'lm_head': 'cpu'} +You can also write your own device map following the same format (a dictionary layer name to device). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_13.txt b/chunked/content_aware_chunking/main_classes_model/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fdb08ad4848aa70fa4e751995e1985925064764 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_13.txt @@ -0,0 +1 @@ +It should map all parameters of the model to a given device, but you don't have to detail where all the submodules of one layer go if that layer is entirely on the same device. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_14.txt b/chunked/content_aware_chunking/main_classes_model/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f21051892b33c072af863ae1a92997e680886175 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_14.txt @@ -0,0 +1,4 @@ +For instance, the following device map would work properly for T0pp (as long as you have the GPU memory): +python +device_map = {"shared": 0, "encoder": 0, "decoder": 1, "lm_head": 1} +Another way to minimize the memory impact of your model is to instantiate it at a lower precision dtype (like torch.float16) or use direct quantization techniques as described below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_15.txt b/chunked/content_aware_chunking/main_classes_model/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7cac9ba7ea6b2ac957bd8b8e5288d50ec7e84f8 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_15.txt @@ -0,0 +1,2 @@ +Model Instantiation dtype +Under Pytorch a model normally gets instantiated with torch.float32 format. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_16.txt b/chunked/content_aware_chunking/main_classes_model/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfd4bdd8ee48241dc233aea3a60aba08446324bd --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_16.txt @@ -0,0 +1,2 @@ +This can be an issue if one tries to +load a model whose weights are in fp16, since it'd require twice as much memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_17.txt b/chunked/content_aware_chunking/main_classes_model/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..86024adb1c34b702e57cfa71eebdecca97205672 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_17.txt @@ -0,0 +1,13 @@ +To overcome this limitation, you can +either explicitly pass the desired dtype using torch_dtype argument: +python +model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16) +or, if you want the model to always load in the most optimal memory pattern, you can use the special value "auto", +and then dtype will be automatically derived from the model's weights: +python +model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto") +Models instantiated from scratch can also be told which dtype to use with: +python +config = T5Config.from_pretrained("t5") +model = AutoModel.from_config(config) +Due to Pytorch design, this functionality is only available for floating dtypes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_model/chunk_18.txt b/chunked/content_aware_chunking/main_classes_model/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd31d889085cf434375c8816e5aaa6228fbe205b --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_model/chunk_18.txt @@ -0,0 +1,16 @@ +ModuleUtilsMixin +[[autodoc]] modeling_utils.ModuleUtilsMixin +TFPreTrainedModel +[[autodoc]] TFPreTrainedModel + - push_to_hub + - all +TFModelUtilsMixin +[[autodoc]] modeling_tf_utils.TFModelUtilsMixin +FlaxPreTrainedModel +[[autodoc]] FlaxPreTrainedModel + - push_to_hub + - all +Pushing to the Hub +[[autodoc]] utils.PushToHubMixin +Sharded checkpoints +[[autodoc]] modeling_utils.load_sharded_checkpoint \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_onnx/chunk_2.txt b/chunked/content_aware_chunking/main_classes_onnx/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c8c007786a8d61ef16cb89c1105abc8965d652c --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_onnx/chunk_2.txt @@ -0,0 +1,17 @@ +ONNX Configurations +We provide three abstract classes that you should inherit from, depending on the +type of model architecture you wish to export: + +Encoder-based models inherit from [~onnx.config.OnnxConfig] +Decoder-based models inherit from [~onnx.config.OnnxConfigWithPast] +Encoder-decoder models inherit from [~onnx.config.OnnxSeq2SeqConfigWithPast] + +OnnxConfig +[[autodoc]] onnx.config.OnnxConfig +OnnxConfigWithPast +[[autodoc]] onnx.config.OnnxConfigWithPast +OnnxSeq2SeqConfigWithPast +[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast +ONNX Features +Each ONNX configuration is associated with a set of features that enable you +to export models for different types of topologies or tasks. 
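To make the inheritance scheme above concrete, here is a hedged sketch of an encoder-style configuration; the class name is invented for illustration, and the only contract shown is the inputs property mapping each model input to its dynamic axes.

```python
from collections import OrderedDict
from typing import Mapping

from transformers.onnx import OnnxConfig


class MyEncoderOnnxConfig(OnnxConfig):  # hypothetical name for an encoder-based model
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        # Map each input name to its dynamic axes (axis index -> symbolic axis name).
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )
```

A configuration instance is then built from the model's own config, e.g. `onnx_config = MyEncoderOnnxConfig(model.config)`, before running the export.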
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_onnx/chunk_3.txt b/chunked/content_aware_chunking/main_classes_onnx/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a6a310edae2675eeab7d3011b87e16571814927 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_onnx/chunk_3.txt @@ -0,0 +1,2 @@ +FeaturesManager +[[autodoc]] onnx.features.FeaturesManager \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_10.txt b/chunked/content_aware_chunking/main_classes_output/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..485ae450511fa8b0eba87f021fca109bfb048080 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_10.txt @@ -0,0 +1,4 @@ +Here for instance, it has two elements, loss then logits, so +python +outputs[:2] +will return the tuple (outputs.loss, outputs.logits) for instance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_11.txt b/chunked/content_aware_chunking/main_classes_output/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b19c6a4a90ab8464ac965f7c0958eb576945ada --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_11.txt @@ -0,0 +1,2 @@ +When considering our outputs object as dictionary, it only considers the attributes that don't have None +values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_12.txt b/chunked/content_aware_chunking/main_classes_output/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7264e747c8df30f9049245165db50969390b5ed6 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_12.txt @@ -0,0 +1 @@ +Here for instance, it has two keys that are loss and logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_13.txt b/chunked/content_aware_chunking/main_classes_output/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..28772ab0ee214107694ca76e509047730b009f09 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_13.txt @@ -0,0 +1 @@ +We document here the generic model outputs that are used by more than one model type. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_14.txt b/chunked/content_aware_chunking/main_classes_output/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb836e00ad5d5a6c5c2614bf70579a17171d5cf1 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_14.txt @@ -0,0 +1,2 @@ +Specific output types are +documented on their corresponding model page. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_15.txt b/chunked/content_aware_chunking/main_classes_output/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..340380dfd27d9b9e1ce74835304b1b447e9b39b5 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_15.txt @@ -0,0 +1,127 @@ +ModelOutput +[[autodoc]] utils.ModelOutput + - to_tuple +BaseModelOutput +[[autodoc]] modeling_outputs.BaseModelOutput +BaseModelOutputWithPooling +[[autodoc]] modeling_outputs.BaseModelOutputWithPooling +BaseModelOutputWithCrossAttentions +[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions +BaseModelOutputWithPoolingAndCrossAttentions +[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions +BaseModelOutputWithPast +[[autodoc]] modeling_outputs.BaseModelOutputWithPast +BaseModelOutputWithPastAndCrossAttentions +[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions +Seq2SeqModelOutput +[[autodoc]] modeling_outputs.Seq2SeqModelOutput +CausalLMOutput +[[autodoc]] modeling_outputs.CausalLMOutput +CausalLMOutputWithCrossAttentions +[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions +CausalLMOutputWithPast +[[autodoc]] modeling_outputs.CausalLMOutputWithPast +MaskedLMOutput +[[autodoc]] modeling_outputs.MaskedLMOutput +Seq2SeqLMOutput +[[autodoc]] modeling_outputs.Seq2SeqLMOutput +NextSentencePredictorOutput +[[autodoc]] modeling_outputs.NextSentencePredictorOutput +SequenceClassifierOutput +[[autodoc]] modeling_outputs.SequenceClassifierOutput +Seq2SeqSequenceClassifierOutput +[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput +MultipleChoiceModelOutput +[[autodoc]] modeling_outputs.MultipleChoiceModelOutput +TokenClassifierOutput +[[autodoc]] modeling_outputs.TokenClassifierOutput +QuestionAnsweringModelOutput +[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput +Seq2SeqQuestionAnsweringModelOutput +[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput +Seq2SeqSpectrogramOutput +[[autodoc]] modeling_outputs.Seq2SeqSpectrogramOutput +SemanticSegmenterOutput +[[autodoc]] modeling_outputs.SemanticSegmenterOutput +ImageClassifierOutput +[[autodoc]] modeling_outputs.ImageClassifierOutput +ImageClassifierOutputWithNoAttention +[[autodoc]] modeling_outputs.ImageClassifierOutputWithNoAttention +DepthEstimatorOutput +[[autodoc]] modeling_outputs.DepthEstimatorOutput +Wav2Vec2BaseModelOutput +[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput +XVectorOutput +[[autodoc]] modeling_outputs.XVectorOutput +Seq2SeqTSModelOutput +[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput +Seq2SeqTSPredictionOutput +[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput +SampleTSPredictionOutput +[[autodoc]] modeling_outputs.SampleTSPredictionOutput +TFBaseModelOutput +[[autodoc]] modeling_tf_outputs.TFBaseModelOutput +TFBaseModelOutputWithPooling +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling +TFBaseModelOutputWithPoolingAndCrossAttentions +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions +TFBaseModelOutputWithPast +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast +TFBaseModelOutputWithPastAndCrossAttentions +[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions +TFSeq2SeqModelOutput +[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput +TFCausalLMOutput +[[autodoc]] modeling_tf_outputs.TFCausalLMOutput +TFCausalLMOutputWithCrossAttentions +[[autodoc]] 
modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions +TFCausalLMOutputWithPast +[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast +TFMaskedLMOutput +[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput +TFSeq2SeqLMOutput +[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput +TFNextSentencePredictorOutput +[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput +TFSequenceClassifierOutput +[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput +TFSeq2SeqSequenceClassifierOutput +[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput +TFMultipleChoiceModelOutput +[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput +TFTokenClassifierOutput +[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput +TFQuestionAnsweringModelOutput +[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput +TFSeq2SeqQuestionAnsweringModelOutput +[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput +FlaxBaseModelOutput +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput +FlaxBaseModelOutputWithPast +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast +FlaxBaseModelOutputWithPooling +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling +FlaxBaseModelOutputWithPastAndCrossAttentions +[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions +FlaxSeq2SeqModelOutput +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput +FlaxCausalLMOutputWithCrossAttentions +[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions +FlaxMaskedLMOutput +[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput +FlaxSeq2SeqLMOutput +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput +FlaxNextSentencePredictorOutput +[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput +FlaxSequenceClassifierOutput +[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput +FlaxSeq2SeqSequenceClassifierOutput +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput +FlaxMultipleChoiceModelOutput +[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput +FlaxTokenClassifierOutput +[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput +FlaxQuestionAnsweringModelOutput +[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput +FlaxSeq2SeqQuestionAnsweringModelOutput +[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_7.txt b/chunked/content_aware_chunking/main_classes_output/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eb52f4afa8a652a915b2ac84fca7f596af431bc --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_7.txt @@ -0,0 +1,2 @@ +You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you +will get None. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_8.txt b/chunked/content_aware_chunking/main_classes_output/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e56b96ef63a2763498590d076f427b1c8001ffe7 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_8.txt @@ -0,0 +1,2 @@ +Here for instance outputs.loss is the loss computed by the model, and outputs.attentions is +None. 
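A minimal sketch tying together the access patterns described in the surrounding chunks (attribute, key, and slice access); the BERT checkpoint is only a convenient example and its freshly added classification head carries randomly initialized weights.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")

inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
outputs = model(**inputs, labels=torch.tensor([1]))

print(outputs.loss)              # attribute access
print(outputs["logits"].shape)   # dict-style access, None attributes are skipped
loss, logits = outputs[:2]       # tuple-style access: loss first, then logits
print(outputs.attentions)        # not requested in the forward pass, so None
```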
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_output/chunk_9.txt b/chunked/content_aware_chunking/main_classes_output/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..557b8afddf841f669460eb7949500c67e8b5b417 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_output/chunk_9.txt @@ -0,0 +1 @@ +When considering our outputs object as tuple, it only considers the attributes that don't have None values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_16.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3d8581dc4488fd1aba6b8e6ae91b6b9d9846fd1 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_16.txt @@ -0,0 +1,7 @@ +You can still have 1 thread that + # does the preprocessing while the main runs the big inference + yield "This is a test" +for out in pipe(data()): + print(out) + # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"} + # {"text": .} \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_17.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..91c354d2e5fdcf2a059380d488ab2520b2c6d539 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_17.txt @@ -0,0 +1 @@ +# . \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_18.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..866ee656d6b08a7261bdd5311a013ba576a7577e --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_18.txt @@ -0,0 +1,3 @@ +[[autodoc]] pipeline +Pipeline batching +All pipelines can use batching. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_19.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfcf91c32725634a7fb6a6c253b4b6564ebcdcb4 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_19.txt @@ -0,0 +1,2 @@ +This will work +whenever the pipeline uses its streaming ability (so when passing lists or Dataset or generator). \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_20.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c0bede0560852324172e393ddc25c137a6a6bee --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_20.txt @@ -0,0 +1,13 @@ +thon +from transformers import pipeline +from transformers.pipelines.pt_utils import KeyDataset +import datasets +dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised") +pipe = pipeline("text-classification", device=0) +for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"): + print(out) + # [{'label': 'POSITIVE', 'score': 0.9998743534088135}] + # Exactly the same output as before, but the content are passed + # as batches to the model + +However, this is not automatically a win for performance. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_21.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..48f80835e8684e7a57ee4331dd979c2400614761 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_21.txt @@ -0,0 +1,2 @@ +It can be either a 10x speedup or 5x slowdown depending +on hardware, data and the actual model being used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_22.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f35e7b9a1c0b2017d8a0376b48e0488868575fea --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_22.txt @@ -0,0 +1,48 @@ +Example where it's mostly a speedup: + +python +from transformers import pipeline +from torch.utils.data import Dataset +from tqdm.auto import tqdm +pipe = pipeline("text-classification", device=0) +class MyDataset(Dataset): + def __len__(self): + return 5000 +def __getitem__(self, i): + return "This is a test" + +dataset = MyDataset() +for batch_size in [1, 8, 64, 256]: + print("-" * 30) + print(f"Streaming batch_size={batch_size}") + for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)): + pass + +On GTX 970 + +Streaming no batching +100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s] + +Streaming batch_size=8 +100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s] + +Streaming batch_size=64 +100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s] + +Streaming batch_size=256 +100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s] +(diminishing returns, saturated the GPU) + +Example where it's mostly a slowdown: +python +class MyDataset(Dataset): + def __len__(self): + return 5000 +def __getitem__(self, i): + if i % 64 == 0: + n = 100 + else: + n = 1 + return "This is a test" * n + +This is an occasional very long sentence compared to the others. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_23.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a2bf327dfc480b0930fd6bc51570d71fcf42005 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_23.txt @@ -0,0 +1,2 @@ +In that case, the whole batch will need to be 400 +tokens long, so the whole batch will be [64, 400] instead of [64, 4], leading to the high slowdown. \ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_24.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..a14c9b2914adafdc26b33609eaf55fa6cab70117 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_24.txt @@ -0,0 +1,2 @@ +Even worse, on +bigger batches, the program simply crashes.
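To see where the [64, 400] shape above comes from, the hedged snippet below tokenizes one deliberately long string alongside short ones; the checkpoint name and lengths are illustrative only, and with padding enabled the whole batch is padded to its longest member.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# 63 short sentences plus one ~400-word outlier, mimicking the i % 64 == 0 case.
batch = ["This is a test"] * 63 + ["This is a test " * 100]
encoded = tokenizer(batch, padding=True, return_tensors="pt")

# Every row is padded to the longest sequence, so the tensor is roughly
# [64, ~400] instead of [64, 6].
print(encoded["input_ids"].shape)
```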
\ No newline at end of file diff --git a/chunked/content_aware_chunking/main_classes_pipelines/chunk_25.txt b/chunked/content_aware_chunking/main_classes_pipelines/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7053c98bd8b81ed63e0926481c942026447edf29 --- /dev/null +++ b/chunked/content_aware_chunking/main_classes_pipelines/chunk_25.txt @@ -0,0 +1,15 @@ +Streaming no batching +100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s] + +Streaming batch_size=8 +100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s] + +Streaming batch_size=64 +100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s] + +Streaming batch_size=256 + 0%| | 0/1000 [00:00> E. Also, the embedding matrix is large since it's V x E (V being the vocab size). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..60e0440e845bc4967a0e2e8a22536c51ec5169a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_15.txt @@ -0,0 +1 @@ +If E < H, it has less parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdeae8de72ff922852fd9a1a6fdfdc04a46ea7f7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_16.txt @@ -0,0 +1 @@ +Layers are split in groups that share parameters (to save memory). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd6e72a42abe1169f1f415bc6e87e9f279dd0b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_17.txt @@ -0,0 +1 @@ +Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e66b264aa94fd1a9765bba81fc70ff75eeb9b4a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_18.txt @@ -0,0 +1 @@ +The model must predict if they have been swapped or not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba0214afcc802be2c905f184f5440f08eb91b3a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_19.txt @@ -0,0 +1 @@ +This model was contributed by lysandre. 
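A back-of-the-envelope check of the V x E argument above, with purely illustrative sizes rather than ALBERT's published configurations: factorizing the embedding matrix saves parameters whenever E < H.

```python
# Illustrative numbers, not ALBERT's published configuration.
V = 30_000   # vocabulary size
H = 4_096    # hidden size
E = 128      # embedding size

tied = V * H                  # embedding size tied to the hidden size
factorized = V * E + E * H    # V x E embeddings projected up to H

print(f"tied: {tied:,} params, factorized: {factorized:,} params")
```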
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fd8cc17999e04b1035e109cdafc2003f9b11bb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_20.txt @@ -0,0 +1,2 @@ +This model jax version was contributed by +kamalkraj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_21.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_21.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_22.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..28198422c301b9a3937ce3f78f2347eaa1e71bd1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_22.txt @@ -0,0 +1,2 @@ +Resources +The resources provided in the following sections consist of a list of official Hugging Face and community (indicated by 🌎) resources to help you get started with AlBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_23.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_23.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_24.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_25.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..14d67008f54c9773087fd537969e08d44808cb7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_25.txt @@ -0,0 +1 @@ +[AlbertForSequenceClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_26.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..515d6f0f7c3e46ba05f03c9c3fb81bcd2c5c62df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_26.txt @@ -0,0 +1 @@ +[TFAlbertForSequenceClassification] is supported by this example script. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_27.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..49940ee64b2347f246b2ee08ac51f9806aa35597 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_27.txt @@ -0,0 +1 @@ +[FlaxAlbertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_28.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2a02220ce4ccf3f6e77c3d37331aef1dd945bdd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_28.txt @@ -0,0 +1 @@ +Check the Text classification task guide on how to use the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_29.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..8920b98f473601ba765152cb4a94a015f2af1ae4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_29.txt @@ -0,0 +1 @@ +[AlbertForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_30.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd20a8e6769c04d0cc8391f1fe18ef10cd36c2e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_30.txt @@ -0,0 +1 @@ +[TFAlbertForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_31.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dbcb7d9f03e9266dab6a7fa6a1b7671e910ad93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_31.txt @@ -0,0 +1 @@ +[FlaxAlbertForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_32.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_32.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_33.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f961d07bfc3d02d4e1407265f3d4eaab1ad20ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_33.txt @@ -0,0 +1 @@ +Check the Token classification task guide on how to use the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_34.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..671d1f5976124229d2aa5b2717c17bd567613cae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_34.txt @@ -0,0 +1 @@ +[AlbertForMaskedLM] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_35.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8641e59a9a9f78fe90d1249b87204367e2d455d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_35.txt @@ -0,0 +1 @@ +[TFAlbertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_36.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bfc63c7f41d07d5ed5bf3a739983bf1762addbe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_36.txt @@ -0,0 +1 @@ +[FlaxAlbertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_37.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_37.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_38.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a9f2a2fe11b253c6818c96c60f8f9e951a0abee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_38.txt @@ -0,0 +1 @@ +Check the Masked language modeling task guide on how to use the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_39.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1c9daf442271f653ae54e8cd82bb9a2ff330708 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_39.txt @@ -0,0 +1 @@ +[AlbertForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_40.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebcb1714c5977b180a9bdcecab3ec9a4c0e79c96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_40.txt @@ -0,0 +1 @@ +[TFAlbertForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_41.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab4b65ce6cfb232d8afe54dabc2a3e333e09c644 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_41.txt @@ -0,0 +1 @@ +[FlaxAlbertForQuestionAnswering] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_42.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_42.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_43.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..741afc10a28aa4181cae897ab68cc3850c219204 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_43.txt @@ -0,0 +1 @@ +Check the Question answering task guide on how to use the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_44.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ad2926ded4f448bb5aa23ce2518992e65dc89c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_44.txt @@ -0,0 +1,3 @@ +Multiple choice + +[AlbertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_45.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc3dfe44b336b7acb049c11c10b361fd989e15b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_45.txt @@ -0,0 +1 @@ +[TFAlbertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_46.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ad5fa2091f5a96f31f96a13d5680c41d3d7aeb1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_46.txt @@ -0,0 +1 @@ +Check the Multiple choice task guide on how to use the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_47.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6b914bd0e48305de02ddafbe67202dd4a24e5fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_47.txt @@ -0,0 +1,78 @@ +AlbertConfig +[[autodoc]] AlbertConfig +AlbertTokenizer +[[autodoc]] AlbertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +AlbertTokenizerFast +[[autodoc]] AlbertTokenizerFast +Albert specific outputs +[[autodoc]] models.albert.modeling_albert.AlbertForPreTrainingOutput +[[autodoc]] models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput + +AlbertModel +[[autodoc]] AlbertModel + - forward +AlbertForPreTraining +[[autodoc]] AlbertForPreTraining + - forward +AlbertForMaskedLM +[[autodoc]] AlbertForMaskedLM + - forward +AlbertForSequenceClassification +[[autodoc]] AlbertForSequenceClassification + - forward +AlbertForMultipleChoice +[[autodoc]] AlbertForMultipleChoice +AlbertForTokenClassification +[[autodoc]] AlbertForTokenClassification + - forward +AlbertForQuestionAnswering +[[autodoc]] AlbertForQuestionAnswering + - forward + +TFAlbertModel +[[autodoc]] TFAlbertModel + - call +TFAlbertForPreTraining +[[autodoc]] TFAlbertForPreTraining + - call +TFAlbertForMaskedLM +[[autodoc]] TFAlbertForMaskedLM + - call +TFAlbertForSequenceClassification +[[autodoc]] TFAlbertForSequenceClassification + - call +TFAlbertForMultipleChoice +[[autodoc]] TFAlbertForMultipleChoice + - call +TFAlbertForTokenClassification +[[autodoc]] TFAlbertForTokenClassification + - call +TFAlbertForQuestionAnswering +[[autodoc]] TFAlbertForQuestionAnswering + - call + +FlaxAlbertModel +[[autodoc]] FlaxAlbertModel + - call +FlaxAlbertForPreTraining +[[autodoc]] FlaxAlbertForPreTraining + - call +FlaxAlbertForMaskedLM +[[autodoc]] FlaxAlbertForMaskedLM + - call +FlaxAlbertForSequenceClassification +[[autodoc]] FlaxAlbertForSequenceClassification + - call +FlaxAlbertForMultipleChoice +[[autodoc]] FlaxAlbertForMultipleChoice + - call +FlaxAlbertForTokenClassification +[[autodoc]] FlaxAlbertForTokenClassification + - call +FlaxAlbertForQuestionAnswering +[[autodoc]] FlaxAlbertForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9decfd3f66e25ea4f8272e3c790dd9c50422c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_8.txt @@ -0,0 +1,2 @@ +As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and +SQuAD benchmarks while having fewer parameters compared to BERT-large. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_albert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_albert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba0214afcc802be2c905f184f5440f08eb91b3a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_albert/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by lysandre. 
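Since the API reference above lists the ALBERT classes without a usage snippet, here is a hedged, minimal masked-language-modeling example; the albert-base-v2 checkpoint is assumed, and sentencepiece must be installed for the slow tokenizer.

```python
import torch
from transformers import AlbertForMaskedLM, AlbertTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForMaskedLM.from_pretrained("albert-base-v2")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring token at the masked position.
mask_positions = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```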
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_10.txt b/chunked/content_aware_chunking/model_doc_align/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..51ccdc5cd6a600968999ad2daf12ed93b6596bcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_10.txt @@ -0,0 +1 @@ +In this paper, we leverage a noisy dataset of over one billion image alt-text pairs, obtained without expensive filtering or post-processing steps in the Conceptual Captions dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_11.txt b/chunked/content_aware_chunking/model_doc_align/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b03fc83aa86c89e6ea8f43a458a7562cbff9313c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_11.txt @@ -0,0 +1 @@ +A simple dual-encoder architecture learns to align visual and language representations of the image and text pairs using a contrastive loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_12.txt b/chunked/content_aware_chunking/model_doc_align/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..06cb295f9d0076c5c492a38a72d4fb510bdd2a8e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_12.txt @@ -0,0 +1 @@ +We show that the scale of our corpus can make up for its noise and leads to state-of-the-art representations even with such a simple learning scheme. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_13.txt b/chunked/content_aware_chunking/model_doc_align/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b95a468c7bd2bdd6591d995f39a249a4505c129 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_13.txt @@ -0,0 +1 @@ +Our visual representation achieves strong performance when transferred to classification tasks such as ImageNet and VTAB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_14.txt b/chunked/content_aware_chunking/model_doc_align/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca944d1f6998661e0443116300558e572b35bde8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_14.txt @@ -0,0 +1 @@ +The aligned visual and language representations enables zero-shot image classification and also set new state-of-the-art results on Flickr30K and MSCOCO image-text retrieval benchmarks, even when compared with more sophisticated cross-attention models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_15.txt b/chunked/content_aware_chunking/model_doc_align/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae73307dc2179a7ef6a67619009eec00503a9820 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_15.txt @@ -0,0 +1 @@ +The representations also enable cross-modality search with complex text and text + image queries. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_16.txt b/chunked/content_aware_chunking/model_doc_align/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c080f2d1da6d833d49b77fc425bcc9455dd287b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_16.txt @@ -0,0 +1 @@ +This model was contributed by Alara Dirik. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_17.txt b/chunked/content_aware_chunking/model_doc_align/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f82a42b024b27ab48283c639d868ac85418bf3f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_17.txt @@ -0,0 +1 @@ +The original code is not released, this implementation is based on the Kakao Brain implementation based on the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_18.txt b/chunked/content_aware_chunking/model_doc_align/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8ff336d5292c7f7e204c507f6da1a4e376bea66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_18.txt @@ -0,0 +1,2 @@ +Usage example +ALIGN uses EfficientNet to get visual features and BERT to get the text features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_19.txt b/chunked/content_aware_chunking/model_doc_align/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab29347a1e9ff8b418a939d0435bc5920189e304 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_19.txt @@ -0,0 +1 @@ +Both the text and visual features are then projected to a latent space with identical dimension. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_20.txt b/chunked/content_aware_chunking/model_doc_align/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..0df25eb9843da9e3dc1958635185a8a878b86e82 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_20.txt @@ -0,0 +1 @@ +The dot product between the projected image and text features is then used as a similarity score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_21.txt b/chunked/content_aware_chunking/model_doc_align/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4e66704b274ecd69a7f4afc6b7a77a21c760b80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_21.txt @@ -0,0 +1 @@ +[AlignProcessor] wraps [EfficientNetImageProcessor] and [BertTokenizer] into a single instance to both encode the text and preprocess the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_22.txt b/chunked/content_aware_chunking/model_doc_align/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d5f3bc4fa23ba8e43e6aa1c4ef6e3c3f2e522b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_22.txt @@ -0,0 +1 @@ +The following example shows how to get the image-text similarity scores using [AlignProcessor] and [AlignModel]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_23.txt b/chunked/content_aware_chunking/model_doc_align/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fa1f0610dc16657ad72f1373f425911c9c0bc07 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_23.txt @@ -0,0 +1,21 @@ +thon +import requests +import torch +from PIL import Image +from transformers import AlignProcessor, AlignModel +processor = AlignProcessor.from_pretrained("kakaobrain/align-base") +model = AlignModel.from_pretrained("kakaobrain/align-base") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +candidate_labels = ["an image of a cat", "an image of a dog"] +inputs = processor(text=candidate_labels, images=image, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) +# this is the image-text similarity score +logits_per_image = outputs.logits_per_image +# we can take the softmax to get the label probabilities +probs = logits_per_image.softmax(dim=1) +print(probs) + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ALIGN. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_24.txt b/chunked/content_aware_chunking/model_doc_align/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3e3faf971e9fc9a0f7bb39fa55f7c4049a1d286 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_24.txt @@ -0,0 +1 @@ +A blog post on ALIGN and the COYO-700M dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_25.txt b/chunked/content_aware_chunking/model_doc_align/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c3e6c7b75c5bcc5c7814f9886beba0f61dd35d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_25.txt @@ -0,0 +1 @@ +A zero-shot image classification demo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_26.txt b/chunked/content_aware_chunking/model_doc_align/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..548111076d6e654486033207dd4ae9d4bab990fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_26.txt @@ -0,0 +1 @@ +Model card of kakaobrain/align-base model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_27.txt b/chunked/content_aware_chunking/model_doc_align/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b997fc410fec140a8b1f0a0f029593247bf8b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_27.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_28.txt b/chunked/content_aware_chunking/model_doc_align/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_28.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource.
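The resources above also point to a zero-shot image classification demo; a minimal sketch of the same idea through the pipeline API is shown below, reusing the checkpoint and candidate labels from the example. The exact output format may vary slightly across transformers versions.

```python
# Minimal sketch: zero-shot image classification with ALIGN via the pipeline API.
import requests
from PIL import Image
from transformers import pipeline

classifier = pipeline(task="zero-shot-image-classification", model="kakaobrain/align-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

outputs = classifier(image, candidate_labels=["an image of a cat", "an image of a dog"])
print(outputs)  # a list of {"score": ..., "label": ...} entries, best match first
```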
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_align/chunk_29.txt b/chunked/content_aware_chunking/model_doc_align/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eab4474f5c8cddc87924c65cd27390ac53d8dc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_align/chunk_29.txt @@ -0,0 +1,20 @@ +AlignConfig +[[autodoc]] AlignConfig + - from_text_vision_configs +AlignTextConfig +[[autodoc]] AlignTextConfig +AlignVisionConfig +[[autodoc]] AlignVisionConfig +AlignProcessor +[[autodoc]] AlignProcessor +AlignModel +[[autodoc]] AlignModel + - forward + - get_text_features + - get_image_features +AlignTextModel +[[autodoc]] AlignTextModel + - forward +AlignVisionModel +[[autodoc]] AlignVisionModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8e400d2e3f18f69fa03c814f1f3cce42518ecf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_10.txt @@ -0,0 +1 @@ +the difference from CLIP is the text encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..182c2d9eb300c537d67722a64142453f2f1b8050 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_11.txt @@ -0,0 +1,2 @@ +Note that we use bidirectional attention instead of causal attention +and we take the [CLS] token in XLM-R to represent text embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_12.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..36cb01d5a2bf2d9dfb8eb2de20cbabb5239dff76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_12.txt @@ -0,0 +1 @@ +AltCLIP is a multi-modal vision and language model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_13.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e9a2e30384e44dcd5419ed8b51cf3b0efd662a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_13.txt @@ -0,0 +1,2 @@ +It can be used for image-text similarity and for zero-shot image +classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_14.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b87f0fc8da286a4c89cbfdb63a243395631fa17 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_14.txt @@ -0,0 +1,2 @@ +AltCLIP uses a ViT-like transformer to get visual features and a bidirectional language model to get the text +features.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_15.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab29347a1e9ff8b418a939d0435bc5920189e304 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_15.txt @@ -0,0 +1 @@ +Both the text and visual features are then projected to a latent space with identical dimension. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_16.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9d81c7ab2dc4f86d3742a8a4a647b9d5408ebbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_16.txt @@ -0,0 +1,2 @@ +The dot +product between the projected image and text features is then used as a similarity score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_17.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f9e2d777bd91819099de23f51c6c7cbfbc094ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_17.txt @@ -0,0 +1,2 @@ +To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, +which are then linearly embedded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_18.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f68ae4c009338b46f96259412267959ff4e93781 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_18.txt @@ -0,0 +1 @@ +A [CLS] token is added to serve as representation of an entire image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_19.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdbaa620c3c182949ec7608b509ebc76149bb765 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_19.txt @@ -0,0 +1,2 @@ +The authors +also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_20.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f7be9a6ebac782160b88b44b40139a1ce54d0d7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_20.txt @@ -0,0 +1 @@ +The [CLIPImageProcessor] can be used to resize (or rescale) and normalize images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_21.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..05030adac0e2e51cd95cd298121cc109642573c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_21.txt @@ -0,0 +1,2 @@ +The [AltCLIPProcessor] wraps a [CLIPImageProcessor] and a [XLMRobertaTokenizer] into a single instance to both +encode the text and prepare the images.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_22.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..af93ebd04c13c4af5628854ffd0d15b75d639ee0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_22.txt @@ -0,0 +1,2 @@ +The following example shows how to get the image-text similarity scores using +[AltCLIPProcessor] and [AltCLIPModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_23.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..43690e364ce5f07825dc13022fede825e1a8d318 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_23.txt @@ -0,0 +1,15 @@ +thon + +from PIL import Image +import requests +from transformers import AltCLIPModel, AltCLIPProcessor +model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") +processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) +outputs = model(**inputs) +logits_per_image = outputs.logits_per_image # this is the image-text similarity score +probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + +This model is based on CLIPModel, use it like you would use the original CLIP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_24.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c193adfe3d1f1d27b5aac187fa09fb86229a19c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_24.txt @@ -0,0 +1,20 @@ +AltCLIPConfig +[[autodoc]] AltCLIPConfig + - from_text_vision_configs +AltCLIPTextConfig +[[autodoc]] AltCLIPTextConfig +AltCLIPVisionConfig +[[autodoc]] AltCLIPVisionConfig +AltCLIPProcessor +[[autodoc]] AltCLIPProcessor +AltCLIPModel +[[autodoc]] AltCLIPModel + - forward + - get_text_features + - get_image_features +AltCLIPTextModel +[[autodoc]] AltCLIPTextModel + - forward +AltCLIPVisionModel +[[autodoc]] AltCLIPVisionModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ecc7163fd2448fc1dc4aa4f8ea75a09fd250e29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by jongjyh. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_altclip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_altclip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a2ac0e88a7984e37a8631988893dc9cd000bd1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_altclip/chunk_9.txt @@ -0,0 +1,2 @@ +Usage tips and example +The usage of AltCLIP is very similar to the CLIP. 
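Because AltCLIP follows the CLIP API, the projected embeddings behind that similarity score can also be pulled out directly with get_text_features and get_image_features. A short sketch, reusing the BAAI/AltCLIP checkpoint from the example above, is given below; the manual normalisation is illustrative, since the model normalises internally when computing logits_per_image.

```python
# Sketch: extracting the projected image and text embeddings from AltCLIP.
import requests
import torch
from PIL import Image
from transformers import AltCLIPModel, AltCLIPProcessor

model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
processor = AltCLIPProcessor.from_pretrained("BAAI/AltCLIP")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

with torch.no_grad():
    image_features = model.get_image_features(**processor(images=image, return_tensors="pt"))
    text_features = model.get_text_features(
        **processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True)
    )

# cosine similarity between L2-normalised image and text embeddings
image_features = image_features / image_features.norm(dim=-1, keepdim=True)
text_features = text_features / text_features.norm(dim=-1, keepdim=True)
print(image_features @ text_features.T)
```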
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f3f9f45ee6903ff71012db952a0e99e2f73d43 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +When fine-tuning the Audio Spectrogram Transformer (AST) on your own dataset, it's recommended to take care of the input normalization (to make +sure the input has mean of 0 and std of 0.5). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b57eda5fdbf528c1aa30c268ce4f0d0af00e9dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_13.txt @@ -0,0 +1 @@ +[ASTFeatureExtractor] takes care of this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..458857bf22b0eb0c2980751c7d34381d391f55c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_14.txt @@ -0,0 +1,2 @@ +Note that it uses the AudioSet +mean and std by default. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3034c2588be6945e3a31d062d16a324112b12939 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_15.txt @@ -0,0 +1,2 @@ +You can check ast/src/get_norm_stats.py to see how +the authors compute the stats for a downstream dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..664b9f217aef933cf049fc7d6b05fc166248840c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_16.txt @@ -0,0 +1,2 @@ +Note that the AST needs a low learning rate (the authors use a 10 times smaller learning rate compared to their CNN model proposed in the +PSLA paper) and converges quickly, so please search for a suitable learning rate and learning rate scheduler for your task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..598a116acbcace27ee5753da1c1cd7b1811ac018 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_17.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with the Audio Spectrogram Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..da17527b014de897a599cddf05056fa945f93ea6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_18.txt @@ -0,0 +1 @@ +A notebook illustrating inference with AST for audio classification can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bf42bca8ed1e6ac05c5b9f2c46030ce6ef2c3bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_19.txt @@ -0,0 +1 @@ +[ASTForAudioClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..0407f95f18d099eadb6a53b65039ac4069dd2bae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_20.txt @@ -0,0 +1 @@ +See also: Audio classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
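A minimal inference sketch tying these pieces together follows; the 16 kHz waveform array is assumed to be loaded beforehand (for example with librosa or torchaudio), and the AudioSet-finetuned checkpoint name is the commonly used one on the Hub.

```python
# Sketch: audio classification inference with AST. `waveform` is assumed to be a 1-D
# float array sampled at 16 kHz, loaded beforehand.
import torch
from transformers import ASTFeatureExtractor, ASTForAudioClassification

checkpoint = "MIT/ast-finetuned-audioset-10-10-0.4593"  # AudioSet-finetuned AST checkpoint
feature_extractor = ASTFeatureExtractor.from_pretrained(checkpoint)
model = ASTForAudioClassification.from_pretrained(checkpoint)

# The feature extractor converts the waveform to a spectrogram and applies the
# mean/std normalization discussed above (AudioSet statistics by default).
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_id = int(logits.argmax(-1))
print(model.config.id2label[predicted_id])
```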
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..285842ae3f26a2a7efdcdf19c14c509ce6e7435e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_23.txt @@ -0,0 +1,11 @@ +ASTConfig +[[autodoc]] ASTConfig +ASTFeatureExtractor +[[autodoc]] ASTFeatureExtractor + - call +ASTModel +[[autodoc]] ASTModel + - forward +ASTForAudioClassification +[[autodoc]] ASTForAudioClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aae2301932484a9c505f67850d0a7d14da4b593 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_6.txt @@ -0,0 +1 @@ +In this paper, we answer the question by introducing the Audio Spectrogram Transformer (AST), the first convolution-free, purely attention-based model for audio classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3969910761f4dd56610c38743ae532ef314cb83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_7.txt @@ -0,0 +1 @@ +We evaluate AST on various audio classification benchmarks, where it achieves new state-of-the-art results of 0.485 mAP on AudioSet, 95.6% accuracy on ESC-50, and 98.1% accuracy on Speech Commands V2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d147f047bcdf48512b68355e4abb5fcaf9fd291a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_8.txt @@ -0,0 +1 @@ +Audio Spectrogram Transformer architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_audio-spectrogram-transformer/chunk_9.txt @@ -0,0 +1 @@ +Taken from the original paper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_10.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..089b82977d412061d28abf4491bfc9bb8c8cf494 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_10.txt @@ -0,0 +1,8 @@ +AutoModel +[[autodoc]] AutoModel +TFAutoModel +[[autodoc]] TFAutoModel +FlaxAutoModel +[[autodoc]] FlaxAutoModel +Generic pretraining classes +The following auto classes are available for instantiating a model with a pretraining head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_11.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9146845a0494f7922a7ab8eba01742167e6ffc8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_11.txt @@ -0,0 +1,8 @@ +AutoModelForPreTraining +[[autodoc]] AutoModelForPreTraining +TFAutoModelForPreTraining +[[autodoc]] TFAutoModelForPreTraining +FlaxAutoModelForPreTraining +[[autodoc]] FlaxAutoModelForPreTraining +Natural Language Processing +The following auto classes are available for the following natural language processing tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_12.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3cb5e2a94a7c7af3ae21fe0187c7440b5a2d314 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_12.txt @@ -0,0 +1,58 @@ +AutoModelForCausalLM +[[autodoc]] AutoModelForCausalLM +TFAutoModelForCausalLM +[[autodoc]] TFAutoModelForCausalLM +FlaxAutoModelForCausalLM +[[autodoc]] FlaxAutoModelForCausalLM +AutoModelForMaskedLM +[[autodoc]] AutoModelForMaskedLM +TFAutoModelForMaskedLM +[[autodoc]] TFAutoModelForMaskedLM +FlaxAutoModelForMaskedLM +[[autodoc]] FlaxAutoModelForMaskedLM +AutoModelForMaskGeneration +[[autodoc]] AutoModelForMaskGeneration +TFAutoModelForMaskGeneration +[[autodoc]] TFAutoModelForMaskGeneration +AutoModelForSeq2SeqLM +[[autodoc]] AutoModelForSeq2SeqLM +TFAutoModelForSeq2SeqLM +[[autodoc]] TFAutoModelForSeq2SeqLM +FlaxAutoModelForSeq2SeqLM +[[autodoc]] FlaxAutoModelForSeq2SeqLM +AutoModelForSequenceClassification +[[autodoc]] AutoModelForSequenceClassification +TFAutoModelForSequenceClassification +[[autodoc]] TFAutoModelForSequenceClassification +FlaxAutoModelForSequenceClassification +[[autodoc]] FlaxAutoModelForSequenceClassification +AutoModelForMultipleChoice +[[autodoc]] AutoModelForMultipleChoice +TFAutoModelForMultipleChoice +[[autodoc]] TFAutoModelForMultipleChoice +FlaxAutoModelForMultipleChoice +[[autodoc]] FlaxAutoModelForMultipleChoice +AutoModelForNextSentencePrediction +[[autodoc]] AutoModelForNextSentencePrediction +TFAutoModelForNextSentencePrediction +[[autodoc]] TFAutoModelForNextSentencePrediction +FlaxAutoModelForNextSentencePrediction +[[autodoc]] FlaxAutoModelForNextSentencePrediction +AutoModelForTokenClassification +[[autodoc]] AutoModelForTokenClassification +TFAutoModelForTokenClassification +[[autodoc]] TFAutoModelForTokenClassification +FlaxAutoModelForTokenClassification +[[autodoc]] FlaxAutoModelForTokenClassification +AutoModelForQuestionAnswering +[[autodoc]] AutoModelForQuestionAnswering +TFAutoModelForQuestionAnswering +[[autodoc]] TFAutoModelForQuestionAnswering +FlaxAutoModelForQuestionAnswering +[[autodoc]] 
FlaxAutoModelForQuestionAnswering +AutoModelForTextEncoding +[[autodoc]] AutoModelForTextEncoding +TFAutoModelForTextEncoding +[[autodoc]] TFAutoModelForTextEncoding +Computer vision +The following auto classes are available for the following computer vision tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_13.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..47cb1ce9495396588299eb100427c46505a7d119 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_13.txt @@ -0,0 +1,36 @@ +AutoModelForDepthEstimation +[[autodoc]] AutoModelForDepthEstimation +AutoModelForImageClassification +[[autodoc]] AutoModelForImageClassification +TFAutoModelForImageClassification +[[autodoc]] TFAutoModelForImageClassification +FlaxAutoModelForImageClassification +[[autodoc]] FlaxAutoModelForImageClassification +AutoModelForVideoClassification +[[autodoc]] AutoModelForVideoClassification +AutoModelForMaskedImageModeling +[[autodoc]] AutoModelForMaskedImageModeling +TFAutoModelForMaskedImageModeling +[[autodoc]] TFAutoModelForMaskedImageModeling +AutoModelForObjectDetection +[[autodoc]] AutoModelForObjectDetection +AutoModelForImageSegmentation +[[autodoc]] AutoModelForImageSegmentation +AutoModelForImageToImage +[[autodoc]] AutoModelForImageToImage +AutoModelForSemanticSegmentation +[[autodoc]] AutoModelForSemanticSegmentation +TFAutoModelForSemanticSegmentation +[[autodoc]] TFAutoModelForSemanticSegmentation +AutoModelForInstanceSegmentation +[[autodoc]] AutoModelForInstanceSegmentation +AutoModelForUniversalSegmentation +[[autodoc]] AutoModelForUniversalSegmentation +AutoModelForZeroShotImageClassification +[[autodoc]] AutoModelForZeroShotImageClassification +TFAutoModelForZeroShotImageClassification +[[autodoc]] TFAutoModelForZeroShotImageClassification +AutoModelForZeroShotObjectDetection +[[autodoc]] AutoModelForZeroShotObjectDetection +Audio +The following auto classes are available for the following audio tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_14.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cf8e0de63f3733a9bbe6386f9367da3e3a9aed6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_14.txt @@ -0,0 +1,22 @@ +AutoModelForAudioClassification +[[autodoc]] AutoModelForAudioClassification +AutoModelForAudioFrameClassification +[[autodoc]] TFAutoModelForAudioClassification +TFAutoModelForAudioFrameClassification +[[autodoc]] AutoModelForAudioFrameClassification +AutoModelForCTC +[[autodoc]] AutoModelForCTC +AutoModelForSpeechSeq2Seq +[[autodoc]] AutoModelForSpeechSeq2Seq +TFAutoModelForSpeechSeq2Seq +[[autodoc]] TFAutoModelForSpeechSeq2Seq +FlaxAutoModelForSpeechSeq2Seq +[[autodoc]] FlaxAutoModelForSpeechSeq2Seq +AutoModelForAudioXVector +[[autodoc]] AutoModelForAudioXVector +AutoModelForTextToSpectrogram +[[autodoc]] AutoModelForTextToSpectrogram +AutoModelForTextToWaveform +[[autodoc]] AutoModelForTextToWaveform +Multimodal +The following auto classes are available for the following multimodal tasks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_15.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3211a03f2e13bbf263937e6b90951016b9eec80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_15.txt @@ -0,0 +1,16 @@ +AutoModelForTableQuestionAnswering +[[autodoc]] AutoModelForTableQuestionAnswering +TFAutoModelForTableQuestionAnswering +[[autodoc]] TFAutoModelForTableQuestionAnswering +AutoModelForDocumentQuestionAnswering +[[autodoc]] AutoModelForDocumentQuestionAnswering +TFAutoModelForDocumentQuestionAnswering +[[autodoc]] TFAutoModelForDocumentQuestionAnswering +AutoModelForVisualQuestionAnswering +[[autodoc]] AutoModelForVisualQuestionAnswering +AutoModelForVision2Seq +[[autodoc]] AutoModelForVision2Seq +TFAutoModelForVision2Seq +[[autodoc]] TFAutoModelForVision2Seq +FlaxAutoModelForVision2Seq +[[autodoc]] FlaxAutoModelForVision2Seq \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_3.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d7a26bacdc4d2323f1466abb7b78afd18f9d0e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_3.txt @@ -0,0 +1,4 @@ +For instance +python +model = AutoModel.from_pretrained("google-bert/bert-base-cased") +will create a model that is an instance of [BertModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_4.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f7cea80fd4da3b88900f498bebfdc6893e0a890 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_4.txt @@ -0,0 +1 @@ +There is one class of AutoModel for each task, and for each backend (PyTorch, TensorFlow, or Flax). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_5.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb6140f2b3210246687c52ebecc1de880a580abe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_5.txt @@ -0,0 +1,2 @@ +Extending the Auto Classes +Each of the auto classes has a method to be extended with your custom classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_6.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d31931019a3a0d0e7f003e56407b57ae77d8843 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_6.txt @@ -0,0 +1,9 @@ +For instance, if you have defined a +custom class of model NewModel, make sure you have a NewModelConfig then you can add those to the auto +classes like this: +thon +from transformers import AutoConfig, AutoModel +AutoConfig.register("new-model", NewModelConfig) +AutoModel.register(NewModelConfig, NewModel) + +You will then be able to use the auto classes like you would usually do! 
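For illustration, a hedged sketch of what such hypothetical NewModelConfig and NewModel classes could look like is given below; the layer and config fields are placeholders, and the two class attributes anticipate the requirements described in the next two notes.

```python
# Sketch (hypothetical classes): a bare-bones NewModelConfig / NewModel pair matching
# the registration example above.
import torch.nn as nn
from transformers import PretrainedConfig, PreTrainedModel

class NewModelConfig(PretrainedConfig):
    model_type = "new-model"  # must match the key passed to AutoConfig.register

    def __init__(self, hidden_size=64, **kwargs):
        self.hidden_size = hidden_size
        super().__init__(**kwargs)

class NewModel(PreTrainedModel):
    config_class = NewModelConfig  # must match the config class passed to AutoModel.register

    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.Linear(config.hidden_size, config.hidden_size)

    def forward(self, hidden_states):
        return self.layer(hidden_states)
```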
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_7.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..19415e0a4e9162fe753fdfcf0d4667189f94da76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_7.txt @@ -0,0 +1,2 @@ +If your NewModelConfig is a subclass of [~transformers.PretrainedConfig], make sure its +model_type attribute is set to the same key you use when registering the config (here "new-model"). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_8.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f448f5b546a048a6488b7b64a5c76f6050ee802 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_8.txt @@ -0,0 +1,3 @@ +Likewise, if your NewModel is a subclass of [PreTrainedModel], make sure its +config_class attribute is set to the same class you use when registering the model (here +NewModelConfig). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_auto/chunk_9.txt b/chunked/content_aware_chunking/model_doc_auto/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0e62b486b3ec12d0472828d3721b66b2c7f2f22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_auto/chunk_9.txt @@ -0,0 +1,12 @@ +AutoConfig +[[autodoc]] AutoConfig +AutoTokenizer +[[autodoc]] AutoTokenizer +AutoFeatureExtractor +[[autodoc]] AutoFeatureExtractor +AutoImageProcessor +[[autodoc]] AutoImageProcessor +AutoProcessor +[[autodoc]] AutoProcessor +Generic model classes +The following auto classes are available for instantiating a base model class without a specific head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a7fb771e2684b71a355e677eadabcabd5b1e8d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_10.txt @@ -0,0 +1 @@ +Further, inspired by the stochastic process theory, we design the Auto-Correlation mechanism based on the series periodicity, which conducts the dependencies discovery and representation aggregation at the sub-series level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e48631a06582b98924ce29094987945c35dfd9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_11.txt @@ -0,0 +1 @@ +Auto-Correlation outperforms self-attention in both efficiency and accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..702cf6bcd7fe1ac69c29925b246aff312f28a45e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_12.txt @@ -0,0 +1 @@ +In long-term forecasting, Autoformer yields state-of-the-art accuracy, with a 38% relative improvement on six benchmarks, covering five practical applications: energy, traffic, economics, weather and disease. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd239efa357b6b66d24250ed6905454a55081258 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by elisim and kashif. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..da320e00afa0cc964a5a7ef63dbffe0646db1773 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_16.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_17.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bbfb372b4353e07b3241881e679d5912f22b203 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_18.txt @@ -0,0 +1,10 @@ +Check out the Autoformer blog-post in HuggingFace blog: Yes, Transformers are Effective for Time Series Forecasting (+ Autoformer) + +AutoformerConfig +[[autodoc]] AutoformerConfig +AutoformerModel +[[autodoc]] AutoformerModel + - forward +AutoformerForPrediction +[[autodoc]] AutoformerForPrediction + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2764d15e9d425e8fb5c83cf4ca0a8552f9cce655 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_7.txt @@ -0,0 +1 @@ +Going beyond Transformers, we design Autoformer as a novel decomposition architecture with an Auto-Correlation mechanism. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..c11884a9bb3aec98c3f26b2b33b2026bb58c5cff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_8.txt @@ -0,0 +1 @@ +We break with the pre-processing convention of series decomposition and renovate it as a basic inner block of deep models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_autoformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_autoformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dab50defb8fbd8f8b89789a7dec9aaab3223438 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_autoformer/chunk_9.txt @@ -0,0 +1 @@ +This design empowers Autoformer with progressive decomposition capacities for complex time series. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d7bd1823da159f50965dc1c257552f7a5d56e9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_12.txt @@ -0,0 +1 @@ +In other words, while one sub-model is in use, the other sub-models are idle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d61178d00c5050500c9258e496ab4684df7e2a05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_13.txt @@ -0,0 +1 @@ +If you're using a CUDA device, a simple solution to benefit from an 80% reduction in memory footprint is to offload the submodels from GPU to CPU when they're idle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..daaf9aac950eb1d522228a5ccca9c14af577675f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_14.txt @@ -0,0 +1 @@ +This operation is called CPU offloading. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d3f6e9e4486b23abb83202a899d9e945e153dea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_15.txt @@ -0,0 +1,4 @@ +You can use it with one line of code as follows: +python +model.enable_cpu_offload() +Note that 🤗 Accelerate must be installed before using this feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d71f0ca38ce811d31038e0900f91c630f1b7b77f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_16.txt @@ -0,0 +1 @@ +Here's how to install it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..749bc0545f425bd7c439d6aafd74c66a44385032 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_17.txt @@ -0,0 +1,2 @@ +Using Better Transformer +Better Transformer is an 🤗 Optimum feature that performs kernel fusion under the hood. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4beac76d86169f36baee96e3dfbf1ff29797047a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_18.txt @@ -0,0 +1 @@ +You can gain 20% to 30% in speed with zero performance degradation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a35cf0d57174b4ce74a35b46c6a9fb8fbe9b8213 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_19.txt @@ -0,0 +1,4 @@ +It only requires one line of code to export the model to 🤗 Better Transformer: +python +model = model.to_bettertransformer() +Note that 🤗 Optimum must be installed before using this feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d71f0ca38ce811d31038e0900f91c630f1b7b77f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_20.txt @@ -0,0 +1 @@ +Here's how to install it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1c5703f3c2673ae2ef6dca08ae4201065186e52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_21.txt @@ -0,0 +1,2 @@ +Using Flash Attention 2 +Flash Attention 2 is an even faster, optimized version of the previous optimization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..19b296df268eecde857dac9ddc583af7e670c0aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_22.txt @@ -0,0 +1,2 @@ +Installation +First, check whether your hardware is compatible with Flash Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..84c311a54146c988c8e82aaf553ec41c221356a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_23.txt @@ -0,0 +1 @@ +The latest list of compatible hardware can be found in the official documentation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_24.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff898854e5e7dd8569f851a9b33915004432a6fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_24.txt @@ -0,0 +1 @@ +If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered above. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_25.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c245592d303af87c90f15878a59841c0bfb5daa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_25.txt @@ -0,0 +1,5 @@ +Next, install the latest version of Flash Attention 2: + +pip install -U flash-attn --no-build-isolation +Usage +To load a model using Flash Attention 2, we can pass the attn_implementation="flash_attention_2" flag to .from_pretrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_26.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..9846533b4b22f5676af8f062a2b4be3e9a79e214 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_26.txt @@ -0,0 +1 @@ +We'll also load the model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_27.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbdfa55205e710c3659a3b2cf52920acb89d72e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_27.txt @@ -0,0 +1,5 @@ +torch.float16), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference: +python +model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device) +Performance comparison +The following diagram shows the latency for the native attention implementation (no optimisation) against Better Transformer and Flash Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_28.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..8489183930e8cba960a14d69dc079b3461adce31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_28.txt @@ -0,0 +1 @@ +In all cases, we generate 400 semantic tokens on a 40GB A100 GPU with PyTorch 2.1. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_29.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1b6a6c15ef4b7447018cd782319ee4275d143da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_29.txt @@ -0,0 +1,3 @@ +Flash Attention 2 is also consistently faster than Better Transformer, and its performance improves even more as batch sizes increase: + +To put this into perspective, on an NVIDIA A100 and when generating 400 semantic tokens with a batch size of 16, you can get 17 times the throughput and still be 2 seconds faster than generating sentences one by one with the native model implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_30.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..177b0141a1d39e8a2d70c59d2d1296dfd06349e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_30.txt @@ -0,0 +1 @@ +In other words, all the samples will be generated 17 times faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_31.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..48a76db3897e8b9aa4f0867a4f7ab1f601bdf97c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_31.txt @@ -0,0 +1 @@ +At batch size 8, on an NVIDIA A100, Flash Attention 2 is also 10% faster than Better Transformer, and at batch size 16, 25%. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_32.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf9d0f8de52f9e18931a2888677a225ca83aa0d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_32.txt @@ -0,0 +1,2 @@ +Combining optimization techniques +You can combine optimization techniques, and use CPU offload, half-precision and Flash Attention 2 (or 🤗 Better Transformer) all at once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_33.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..914794d316d44b5e5bf2ae54f9ee207fe0906410 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_33.txt @@ -0,0 +1,10 @@ +thon +from transformers import BarkModel +import torch +device = "cuda" if torch.cuda.is_available() else "cpu" +# load in fp16 and use Flash Attention 2 +model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device) +# enable CPU offload +model.enable_cpu_offload() + +Find out more on inference optimization techniques here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_34.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..80de14f8e74e86fd9b2ad6ebe0e811fca2b58ff1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_34.txt @@ -0,0 +1,2 @@ +Usage tips +Suno offers a library of voice presets in a number of languages here.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_35.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..33eb293d217cd9feea684db53c68412a13e022a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_35.txt @@ -0,0 +1 @@ +These presets are also uploaded in the hub here or here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_36.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0effa855cc06e881e8fe0d484ace56078924b55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_36.txt @@ -0,0 +1,11 @@ +thon + +from transformers import AutoProcessor, BarkModel +processor = AutoProcessor.from_pretrained("suno/bark") +model = BarkModel.from_pretrained("suno/bark") +voice_preset = "v2/en_speaker_6" +inputs = processor("Hello, my dog is cute", voice_preset=voice_preset) +audio_array = model.generate(**inputs) +audio_array = audio_array.cpu().numpy().squeeze() + +Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_37.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..d75d03aea3ab2f9ff8584febf153329d418adcd2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_37.txt @@ -0,0 +1,6 @@ +thon + +# Multilingual speech - simplified Chinese +inputs = processor("惊人的！我会说中文") +# Multilingual speech - French - let's use a voice_preset as well +inputs = processor("Incroyable! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_38.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b9908fb65d788d2ca257d84471fcab1323a94ab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_38.txt @@ -0,0 +1 @@ +Je peux générer du son. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_39.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..e133debbe6d4081ebd948030ca91fc043615afcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_39.txt @@ -0,0 +1,2 @@ +", voice_preset="fr_speaker_5") +Bark can also generate music. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_40.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..194d48595027daf697455a97aa802e6e02cfe151 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_40.txt @@ -0,0 +1 @@ +You can help it out by adding music notes around your lyrics.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_41.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9d3fd8975624ac0188b52181d02d022b0957990 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_41.txt @@ -0,0 +1,5 @@ +inputs = processor("♪ Hello, my dog is cute ♪") +audio_array = model.generate(**inputs) +audio_array = audio_array.cpu().numpy().squeeze() + +The model can also produce nonverbal communications like laughing, sighing and crying. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bark/chunk_42.txt b/chunked/content_aware_chunking/model_doc_bark/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c42fb957cbfbe2db690d43044a7eebb7c83db81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bark/chunk_42.txt @@ -0,0 +1,47 @@ +thon + +# Adding non-speech cues to the input text +inputs = processor("Hello uh [clears throat], my dog is cute [laughter]") +audio_array = model.generate(**inputs) +audio_array = audio_array.cpu().numpy().squeeze() + +To save the audio, simply take the sample rate from the model config and some scipy utility: +thon + +from scipy.io.wavfile import write as write_wav +# save audio to disk, but first take the sample rate from the model config +sample_rate = model.generation_config.sample_rate +write_wav("bark_generation.wav", sample_rate, audio_array) + +BarkConfig +[[autodoc]] BarkConfig + - all +BarkProcessor +[[autodoc]] BarkProcessor + - all + - call +BarkModel +[[autodoc]] BarkModel + - generate + - enable_cpu_offload +BarkSemanticModel +[[autodoc]] BarkSemanticModel + - forward +BarkCoarseModel +[[autodoc]] BarkCoarseModel + - forward +BarkFineModel +[[autodoc]] BarkFineModel + - forward +BarkCausalModel +[[autodoc]] BarkCausalModel + - forward +BarkCoarseConfig +[[autodoc]] BarkCoarseConfig + - all +BarkFineConfig +[[autodoc]] BarkFineConfig + - all +BarkSemanticConfig +[[autodoc]] BarkSemanticConfig + - all \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..872b47b78ef130e7b457e969720f0351ac41b024 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_10.txt @@ -0,0 +1,12 @@ +A composition of the following transformations is applied on the pretraining tasks for the encoder: + +mask random tokens (like in BERT) + +delete random tokens +mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token) +permute sentences +rotate the document to make it start at a specific token + +Implementation Notes + +Bart doesn't use token_type_ids for sequence classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..838e9fa9639ed13fb70ca386e23c7788b0de3e0a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_11.txt @@ -0,0 +1,2 @@ +Use [BartTokenizer] or + [~BartTokenizer.encode] to get the proper splitting.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..699b51e7d25e01365b82d5270b16adbf42cd89ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_12.txt @@ -0,0 +1 @@ +The forward pass of [BartModel] will create the decoder_input_ids if they are not passed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a37816c132cd515e9a34317d8eb47f78dba01e93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_13.txt @@ -0,0 +1 @@ +This is different from some other modeling APIs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb70c403fe02c0c74a554c6ad3a76a1ea0f1fe87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_14.txt @@ -0,0 +1 @@ +A typical use case of this feature is mask filling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..baabeb06c9b7d89ffb361e02a31e73504d3c9912 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_15.txt @@ -0,0 +1,2 @@ +Model predictions are intended to be identical to the original implementation when + forced_bos_token_id=0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..400851cc970d9b85bb3b2ae7056e5d5e4ebd16ab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_16.txt @@ -0,0 +1,2 @@ +This only works, however, if the string you pass to + [fairseq.encode] starts with a space. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0be76f7c24ac762d34346910fac58045bfedc24a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_17.txt @@ -0,0 +1,2 @@ +[~generation.GenerationMixin.generate] should be used for conditional generation tasks like + summarization; see the example in that docstring. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..27b5c5ff853e3b1140b25794a55022392ade4840 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_18.txt @@ -0,0 +1,2 @@ +Models that load the facebook/bart-large-cnn weights will not have a mask_token_id, or be able to perform + mask-filling tasks. 
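Those weights are instead intended for summarization; a minimal sketch of that use (the article text and generation settings below are illustrative assumptions, not documented defaults):

from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
article = "The tower is 324 metres tall, about the same height as an 81-storey building."
inputs = tokenizer(article, return_tensors="pt")
# generate() runs the conditional generation loop; num_beams and max_length are example values
summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=60)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])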
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..b718ebb5b9285713f38214a8d4f408e64bff519a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_19.txt @@ -0,0 +1,2 @@ +Mask Filling +The facebook/bart-base and facebook/bart-large checkpoints can be used to fill multi-token masks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a93474c7c2a33af63296cd742c5dd69d22b086a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_20.txt @@ -0,0 +1,13 @@ +python +from transformers import BartForConditionalGeneration, BartTokenizer +model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0) +tok = BartTokenizer.from_pretrained("facebook/bart-large") +example_english_phrase = "UN Chief Says There Is No <mask> in Syria" +batch = tok(example_english_phrase, return_tensors="pt") +generated_ids = model.generate(batch["input_ids"]) +assert tok.batch_decode(generated_ids, skip_special_tokens=True) == [ + "UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria" +] + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BART. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..12824dc434161cb1de6f4e032b0a71e47129612f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_23.txt @@ -0,0 +1 @@ +A blog post on Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_24.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a28f421b59b610c5ceeaaf27f0791af62acbf22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_24.txt @@ -0,0 +1 @@ +A notebook on how to finetune BART for summarization with fastai using blurr. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_25.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..69ce495221af73342ca03d895fe3ce12f5de4f5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_25.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune BART for summarization in two languages with Trainer class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_26.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fb992a49e3d7d521bdfefec9b4fd4c842d56666 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_26.txt @@ -0,0 +1,2 @@ +🌎 +[BartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_27.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a477aea62c851d41609438995b39ec6fbf970915 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_27.txt @@ -0,0 +1 @@ +[TFBartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_28.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..20c044dd4a771281fdb7fd4905dac5da64eb151d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_28.txt @@ -0,0 +1 @@ +[FlaxBartForConditionalGeneration] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_29.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..c912090a6b76c2fc0c1dcd6cd480eba50461f7f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_29.txt @@ -0,0 +1,2 @@ +An example of how to train [BartForConditionalGeneration] with a Hugging Face datasets object can be found in this forum discussion +Summarization chapter of the 🤗 Hugging Face course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_30.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1dfe6108ccc871dd87897940723417e50b2a1017 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_30.txt @@ -0,0 +1,3 @@ +Summarization task guide + +[BartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_31.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a477aea62c851d41609438995b39ec6fbf970915 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_31.txt @@ -0,0 +1 @@ +[TFBartForConditionalGeneration] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_32.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b53f6944cac32d276342d26c0b9581c2a570914 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_32.txt @@ -0,0 +1 @@ +[FlaxBartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_33.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_33.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_34.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..c57e1d2ee02e32378fca8e2e5ef947cca948a13c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_34.txt @@ -0,0 +1,3 @@ +Masked language modeling task guide + +A notebook on how to finetune mBART using Seq2SeqTrainer for Hindi to English translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_35.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fb992a49e3d7d521bdfefec9b4fd4c842d56666 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_35.txt @@ -0,0 +1,2 @@ +🌎 +[BartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_36.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..a477aea62c851d41609438995b39ec6fbf970915 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_36.txt @@ -0,0 +1 @@ +[TFBartForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_37.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..34ceea649572892a2c09f9106116f8780663014c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_37.txt @@ -0,0 +1,7 @@ +Translation task guide + +See also: +- Text classification task guide +- Question answering task guide +- Causal language modeling task guide +- Distilled checkpoints are described in this paper. 
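For the distilled checkpoints mentioned in the list above, a minimal sketch; the checkpoint name sshleifer/distilbart-cnn-12-6 is a commonly used community upload and is an assumption here rather than something this documentation specifies:

from transformers import pipeline

# a distilled BART checkpoint drops into the summarization pipeline like any other seq2seq model
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")
text = "Transformers provides thousands of pretrained models to perform tasks on text, vision, and audio."
print(summarizer(text, max_length=40, min_length=5, do_sample=False))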
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bart/chunk_38.txt b/chunked/content_aware_chunking/model_doc_bart/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ece0861c18d6aa3e5ea7ba3382552e64527a0a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bart/chunk_38.txt @@ -0,0 +1,59 @@ +BartConfig +[[autodoc]] BartConfig + - all +BartTokenizer +[[autodoc]] BartTokenizer + - all +BartTokenizerFast +[[autodoc]] BartTokenizerFast + - all + +BartModel +[[autodoc]] BartModel + - forward +BartForConditionalGeneration +[[autodoc]] BartForConditionalGeneration + - forward +BartForSequenceClassification +[[autodoc]] BartForSequenceClassification + - forward +BartForQuestionAnswering +[[autodoc]] BartForQuestionAnswering + - forward +BartForCausalLM +[[autodoc]] BartForCausalLM + - forward + +TFBartModel +[[autodoc]] TFBartModel + - call +TFBartForConditionalGeneration +[[autodoc]] TFBartForConditionalGeneration + - call +TFBartForSequenceClassification +[[autodoc]] TFBartForSequenceClassification + - call + +FlaxBartModel +[[autodoc]] FlaxBartModel + - call + - encode + - decode +FlaxBartForConditionalGeneration +[[autodoc]] FlaxBartForConditionalGeneration + - call + - encode + - decode +FlaxBartForSequenceClassification +[[autodoc]] FlaxBartForSequenceClassification + - call + - encode + - decode +FlaxBartForQuestionAnswering +[[autodoc]] FlaxBartForQuestionAnswering + - call + - encode + - decode +FlaxBartForCausalLM +[[autodoc]] FlaxBartForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_10.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0df06e0d4c5aaae75f2e41b31c35fca12e3215d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_10.txt @@ -0,0 +1 @@ +BARThez implementation is the same as BART, except for tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_11.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2f27e74075dbfffd0a9d06b9ec1a29851964c9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_11.txt @@ -0,0 +1,2 @@ +Refer to BART documentation for information on +configuration classes and their parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_12.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dbed54fe8fd5bfd3c2de33e66777b9e60ce6e05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_12.txt @@ -0,0 +1 @@ +BARThez-specific tokenizers are documented below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_13.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..88451001fae1839021d7d80f2b5333e86fe92f10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_13.txt @@ -0,0 +1,4 @@ +Resources + +BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check: + examples/pytorch/summarization/. 
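Because BARThez shares the BART architecture, pointing the generic seq2seq classes at a BARThez checkpoint is enough; a minimal sketch, assuming the moussaKam/barthez checkpoint on the Hub:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# BARThez reuses the BART architecture, so the Auto classes resolve it directly
tokenizer = AutoTokenizer.from_pretrained("moussaKam/barthez")
model = AutoModelForSeq2SeqLM.from_pretrained("moussaKam/barthez")
inputs = tokenizer("Paris est la capitale de la France.", return_tensors="pt")
# feeding the inputs back as labels is only a toy target to show the fine-tuning call signature
outputs = model(**inputs, labels=inputs["input_ids"])
print(outputs.loss)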
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_14.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e436d03bab74baf928ffceb07ef150728c594dea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_14.txt @@ -0,0 +1,4 @@ +BarthezTokenizer +[[autodoc]] BarthezTokenizer +BarthezTokenizerFast +[[autodoc]] BarthezTokenizerFast \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_5.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..31591d1cc2f6bd7b241750c395ff0dfeaa07a358 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_5.txt @@ -0,0 +1,3 @@ +Unlike already existing BERT-based French language models such as +CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also +its decoder is pretrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_6.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..572843db5e3ecbfb6da727470b63ac9876529d5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_6.txt @@ -0,0 +1,2 @@ +In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel +summarization dataset, OrangeSum, that we release with this paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_7.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..5605a935fbf49d40cb8d2fb8e4080951692ac930 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_7.txt @@ -0,0 +1,3 @@ +We also continue the pretraining of an already +pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez, +provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_8.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f054d4770b2ee772d2f1324ada758bb1143a870 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by moussakam. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_barthez/chunk_9.txt b/chunked/content_aware_chunking/model_doc_barthez/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c3d0926c3557f620bec98b6a4bd85cc9cc7733a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_barthez/chunk_9.txt @@ -0,0 +1 @@ +The Authors' code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..29b062f184f62f196b3f722c6d085e4bca029d0f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_10.txt @@ -0,0 +1,7 @@ +For example: + +python + +from transformers import MBartForConditionalGeneration +bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable") +TXT = "Chúng tôi là <mask> nghiên cứu viên." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ce57eeb786d1479cbacfc0ed21b16cf498b4627 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_11.txt @@ -0,0 +1,9 @@ +input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] +logits = bartpho(input_ids).logits +masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() +probs = logits[0, masked_index].softmax(dim=0) +values, predictions = probs.topk(5) +print(tokenizer.decode(predictions).split()) + +This implementation is only for tokenization: "monolingual_vocab_file" consists of Vietnamese-specialized types + extracted from the pre-trained SentencePiece model "vocab_file" that is available from the multilingual XLM-RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f66f19479070d999472776287dfa60dfcdb6bc23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_12.txt @@ -0,0 +1,2 @@ +Other languages, if employing this pre-trained multilingual SentencePiece model "vocab_file" for subword + segmentation, can reuse BartphoTokenizer with their own language-specialized "monolingual_vocab_file". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c74c427e5588a8bfdd513e507321a3e2315d671f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_13.txt @@ -0,0 +1,2 @@ +BartphoTokenizer +[[autodoc]] BartphoTokenizer \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_4.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd2a7a812526a52257deb60e2e6f159b084efca3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_4.txt @@ -0,0 +1,2 @@ +We release BARTpho to facilitate future +research and applications of generative Vietnamese NLP tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_5.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d9f19a09b095fd4317c9007e0933e739cacbc67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by dqnguyen. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..142cfa99161459f518eaed8f4aa39674e3453d35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_7.txt @@ -0,0 +1,8 @@ +Usage example +thon + +import torch +from transformers import AutoModel, AutoTokenizer +bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable") +tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable") +line = "Chúng tôi là những nghiên cứu viên." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..12c7a782afadd3e2c749a8bf0c7c60096b554a8c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_8.txt @@ -0,0 +1,13 @@ +input_ids = tokenizer(line, return_tensors="pt") +with torch.no_grad(): + features = bartpho(**input_ids) # Models outputs are now tuples +With TensorFlow 2.0+: +from transformers import TFAutoModel +bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable") +input_ids = tokenizer(line, return_tensors="tf") +features = bartpho(**input_ids) + +Usage tips + +Following mBART, BARTpho uses the "large" architecture of BART with an additional layer-normalization layer on top of + both the encoder and decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bartpho/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bartpho/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b92e1e80ad0eaec3b9fa48c07b39037ca8a0b7a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bartpho/chunk_9.txt @@ -0,0 +1,2 @@ +Thus, usage examples in the documentation of BART, when adapting to use + with BARTpho, should be adjusted by replacing the BART-specialized classes with the mBART-specialized counterparts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ef4c93635a56d2cfdd51ab232dfaa4781d7f2e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_11.txt @@ -0,0 +1,2 @@ +For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K, +significantly outperforming from-scratch DeiT training (81.8%) with the same setup. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..949492a77ffbbf6a6e3175bd45a0cc913564e26c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_12.txt @@ -0,0 +1,2 @@ +Moreover, large-size BEiT obtains +86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8571c753000a20a1d2d6ee156e3246b0bfc2de1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_14.txt @@ -0,0 +1,2 @@ +The JAX/FLAX version of this model was +contributed by kamalkraj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0049753aa1b44afcee8a2129e85637c3d57544c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_16.txt @@ -0,0 +1,3 @@ +Usage tips + +BEiT models are regular Vision Transformers, but pre-trained in a self-supervised way rather than supervised. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfc74136ccc8eb8e0e5d51bc59ab40589d794311 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_17.txt @@ -0,0 +1,2 @@ +They + outperform both the original model (ViT) as well as Data-efficient Image Transformers (DeiT) when fine-tuned on ImageNet-1K and CIFAR-100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b150f902041a8d62f1d8170ae315c2328219db1c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_18.txt @@ -0,0 +1,4 @@ +You can check out demo notebooks regarding inference as well as + fine-tuning on custom data here (you can just replace + [ViTFeatureExtractor] by [BeitImageProcessor] and + [ViTForImageClassification] by [BeitForImageClassification]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75eff1fb75ed12d8bbd21c48d89c3ea318f255c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_19.txt @@ -0,0 +1,2 @@ +There's also a demo notebook available which showcases how to combine DALL-E's image tokenizer with BEiT for + performing masked image modeling. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b22602e54eed8b0f4c7e06c45d8828f94264de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_20.txt @@ -0,0 +1 @@ +You can find it here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..47fde230a941bf4b5968227557426d660b7611ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_21.txt @@ -0,0 +1,2 @@ +As the BEiT models expect each image to be of the same size (resolution), one can use + [BeitImageProcessor] to resize (or rescale) and normalize images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bb110e140728327213fd6eecfc806937373f9fc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_22.txt @@ -0,0 +1,2 @@ +Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of + each checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d104bee50a0c4c85e0f7c816d1bbdee023ebc64a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_23.txt @@ -0,0 +1,2 @@ +For example, microsoft/beit-base-patch16-224 refers to a base-sized architecture with patch + resolution of 16x16 and fine-tuning resolution of 224x224. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..e67ff0b65f0c0a39dd9635e89c149fdccf8fcd69 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_24.txt @@ -0,0 +1 @@ +All checkpoints can be found on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_25.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..24d2b0b9a7e11167c770d1c8e5c036b1e1c5c710 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_25.txt @@ -0,0 +1,3 @@ +The available checkpoints are either (1) pre-trained on ImageNet-22k (a collection of + 14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_26.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7bf9c1f560cefe8bcf1454763765563114dfb36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_26.txt @@ -0,0 +1 @@ +BEiT uses relative position embeddings, inspired by the T5 model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_27.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6ac8221c915840cb8bfd7d17047d0a547c56005 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_27.txt @@ -0,0 +1,2 @@ +During pre-training, the authors shared the + relative position bias among the several self-attention layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_28.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a7eb05577cc1d26055dc05224e4d4d5cb40e3cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_28.txt @@ -0,0 +1,2 @@ +During fine-tuning, each layer's relative position + bias is initialized with the shared relative position bias obtained after pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_29.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..31987ba2e8d689bb3681a3848ea6af5f425ce069 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_29.txt @@ -0,0 +1,4 @@ +Note that, if one wants to + pre-train a model from scratch, one needs to set the use_relative_position_bias + attribute of [BeitConfig] to True in order to add + relative position embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_30.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf4ebb45e3ceb4468dc70e115aeb2124a1f902a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_30.txt @@ -0,0 +1 @@ +BEiT pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_31.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_31.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_32.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..56bb4d130a68be69ecdab5c08f83a0d57618df44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_32.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_33.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..d77944ce7cef77c14880486e76214b4f75d5be06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_33.txt @@ -0,0 +1 @@ +[BeitForImageClassification] is supported by this example script and notebook. 
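For quick reference, a minimal inference sketch that follows the earlier tip about swapping in [BeitImageProcessor] and [BeitForImageClassification]; the image URL is the standard COCO sample used throughout the docs and is only an illustrative choice:

from PIL import Image
import requests
from transformers import BeitImageProcessor, BeitForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
processor = BeitImageProcessor.from_pretrained("microsoft/beit-base-patch16-224")
model = BeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224")
# the processor resizes, rescales and normalizes the image exactly as the checkpoint expects
inputs = processor(images=image, return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])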
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_34.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a615c69674a74107b4aa3e1e0caf11d765d0b2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_34.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Semantic segmentation +- Semantic segmentation task guide +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_35.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_35.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_beit/chunk_36.txt b/chunked/content_aware_chunking/model_doc_beit/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb844685f5dad040d4679c05b767f0a4cf37ae73 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_beit/chunk_36.txt @@ -0,0 +1,36 @@ +BEiT specific outputs +[[autodoc]] models.beit.modeling_beit.BeitModelOutputWithPooling +[[autodoc]] models.beit.modeling_flax_beit.FlaxBeitModelOutputWithPooling +BeitConfig +[[autodoc]] BeitConfig +BeitFeatureExtractor +[[autodoc]] BeitFeatureExtractor + - call + - post_process_semantic_segmentation +BeitImageProcessor +[[autodoc]] BeitImageProcessor + - preprocess + - post_process_semantic_segmentation + +BeitModel +[[autodoc]] BeitModel + - forward +BeitForMaskedImageModeling +[[autodoc]] BeitForMaskedImageModeling + - forward +BeitForImageClassification +[[autodoc]] BeitForImageClassification + - forward +BeitForSemanticSegmentation +[[autodoc]] BeitForSemanticSegmentation + - forward + +FlaxBeitModel +[[autodoc]] FlaxBeitModel + - call +FlaxBeitForMaskedImageModeling +[[autodoc]] FlaxBeitForMaskedImageModeling + - call +FlaxBeitForImageClassification +[[autodoc]] FlaxBeitForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c68cb89eb3e39a76d805f3dc321d9334a047a314 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_10.txt @@ -0,0 +1,8 @@ +: +thon + +instantiate sentence fusion model +sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse") +tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse") +input_ids = tokenizer( + "This is the first sentence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0afab6af59a5f19bcc34c787ab7fb7244efc8ba1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_11.txt @@ -0,0 +1 @@ +This is the second sentence. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b66c2cdaa75f553ade6a7bf1fd622cd176b45ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_12.txt @@ -0,0 +1,9 @@ +", add_special_tokens=False, return_tensors="pt" + ).input_ids +outputs = sentence_fuser.generate(input_ids) +print(tokenizer.decode(outputs[0])) + +Tips: + +[BertGenerationEncoder] and [BertGenerationDecoder] should be used in + combination with [EncoderDecoder]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..81f85aee8da7bdcdff9fa0a55ff808e8ef67d25f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_13.txt @@ -0,0 +1 @@ +For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ac0d5ff31a093e18174de3a0c9692826b824850 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_14.txt @@ -0,0 +1 @@ +Therefore, no EOS token should be added to the end of the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..229cd6fcfe528efc0c434b97f2a28426c4fd6e21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_15.txt @@ -0,0 +1,11 @@ +BertGenerationConfig +[[autodoc]] BertGenerationConfig +BertGenerationTokenizer +[[autodoc]] BertGenerationTokenizer + - save_vocabulary +BertGenerationEncoder +[[autodoc]] BertGenerationEncoder + - forward +BertGenerationDecoder +[[autodoc]] BertGenerationDecoder + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_5.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..39e494c53be50eea329ae2d506ca6e7672c40049 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_5.txt @@ -0,0 +1,4 @@ +We +developed a Transformer-based sequence-to-sequence model that is compatible with publicly available pre-trained BERT, +GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the utility of initializing our model, both +encoder and decoder, with these checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..412f981d07e2c5dbac391a3c929779f0da458925 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_6.txt @@ -0,0 +1,2 @@ +Our models result in new state-of-the-art results on Machine Translation, +Text Summarization, Sentence Splitting, and Sentence Fusion. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf67fffa4f5d9001fd37fd419894f4c6591262b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_8.txt @@ -0,0 +1,2 @@ +The original code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-generation/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f6cbfc89104112fbb57fb9921a215c4bdbe09ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-generation/chunk_9.txt @@ -0,0 +1,24 @@ +Usage examples and tips +The model can be used in combination with the [EncoderDecoderModel] to leverage two pretrained BERT checkpoints for +subsequent fine-tuning: +thon + +leverage checkpoints for Bert2Bert model +use BERT's cls token as BOS token and sep token as EOS token +encoder = BertGenerationEncoder.from_pretrained("google-bert/bert-large-uncased", bos_token_id=101, eos_token_id=102) +add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token +decoder = BertGenerationDecoder.from_pretrained( + "google-bert/bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102 + ) +bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder) +create tokenizer +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-large-uncased") +input_ids = tokenizer( + "This is a long article to summarize", add_special_tokens=False, return_tensors="pt" + ).input_ids +labels = tokenizer("This is a short summary", return_tensors="pt").input_ids +train +loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss +loss.backward() + +Pretrained [EncoderDecoderModel] are also directly available in the model hub, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..536ceda7514feb76d668108ffea3bb17753954c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_10.txt @@ -0,0 +1,2 @@ +BertJapaneseTokenizer +[[autodoc]] BertJapaneseTokenizer \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_3.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..b44cdf2dbeaf46d960677af304e3a22e09cdb26c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_3.txt @@ -0,0 +1 @@ +Tokenize into characters. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_4.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..a50e3424740c1e042146d8abd797099b096b6cfc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_4.txt @@ -0,0 +1 @@ +To use MecabTokenizer, you should pip install transformers["ja"] (or pip install -e . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_5.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..aff8a5854c88f3981dc3d7dd77cfc119e354df44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_5.txt @@ -0,0 +1,2 @@ +["ja"] if you install +from source) to install dependencies. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..348f75330a95dec32eb8aff19fad7fde2fa2dbeb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_6.txt @@ -0,0 +1 @@ +See details on cl-tohoku repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9bb983d37800c761224beb0350608e4247aec35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_7.txt @@ -0,0 +1,27 @@ +Example of using a model with MeCab and WordPiece tokenization: +python + +import torch +from transformers import AutoModel, AutoTokenizer +bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese") +tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese") +Input Japanese Text +line = "吾輩は猫である。" +inputs = tokenizer(line, return_tensors="pt") +print(tokenizer.decode(inputs["input_ids"][0])) +[CLS] 吾輩 は 猫 で ある 。 [SEP] +outputs = bertjapanese(**inputs) + +Example of using a model with Character tokenization: +python + +bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char") +tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char") +Input Japanese Text +line = "吾輩は猫である。" +inputs = tokenizer(line, return_tensors="pt") +print(tokenizer.decode(inputs["input_ids"][0])) +[CLS] 吾 輩 は 猫 で あ る 。 [SEP] +outputs = bertjapanese(**inputs) + +This model was contributed by cl-tohoku. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a5912af7e9ddb80d6d583ae2af9b38a65b5bae1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_8.txt @@ -0,0 +1 @@ +This implementation is the same as BERT, except for tokenization method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6959c759efd85cd1e83435815d17ae2c202ff946 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert-japanese/chunk_9.txt @@ -0,0 +1,2 @@ +Refer to BERT documentation for +API reference information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..136f7bc571271cabad74775edb58406d19faadf4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_11.txt @@ -0,0 +1,2 @@ +It is + efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bfdd9d677d97a7eabd60dd6f2f04eb4e76d0a75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_12.txt @@ -0,0 +1,7 @@ +Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually 15%) is masked by: + +a special mask token with probability 0.8 +a random token different from the one masked with probability 0.1 +the same token with probability 0.1 + +The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a separation token in between). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9aaf67a0242123f565d4dc01833d70d1e9d1cf2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_13.txt @@ -0,0 +1 @@ +With probability 50%, the sentences are consecutive in the corpus, in the remaining 50% they are not related. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ea9897f0c9917789176a0cd7fd26e06120c26de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_14.txt @@ -0,0 +1 @@ +The model has to predict if the sentences are consecutive or not. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b05f8e08ca54af88a50da0a8ae71902484877aa6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BERT. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_16.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_17.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..7facadef4610fddbfc8d8d07abba584b76c3a444 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_18.txt @@ -0,0 +1 @@ +A blog post on BERT Text Classification in a different language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcdaae31ae0b2b079744a34fb539dc7b80981ab4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_19.txt @@ -0,0 +1 @@ +A notebook for Finetuning BERT (and friends) for multi-label text classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..1618605d141a5080822fc8020ccb45fbcbc06d3a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_20.txt @@ -0,0 +1 @@ +A notebook on how to Finetune BERT for multi-label classification using PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d9a024de7e7ca59df9eec494de749b5c191637 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_21.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to warm-start an EncoderDecoder model with BERT for summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..28872ee531d850b03bc3cdd4580a4896e8deca44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_22.txt @@ -0,0 +1 @@ +[BertForSequenceClassification] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a82d68ad43232254dff177d6ec10cf03062c1b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_23.txt @@ -0,0 +1 @@ +[TFBertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_24.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a9c76d368d3af4e88d4d91520bb93483750b713 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_24.txt @@ -0,0 +1 @@ +[FlaxBertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_25.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..c488d6622432c4d766668d05e459345a44b62cb5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_25.txt @@ -0,0 +1,3 @@ +Text classification task guide + +A blog post on how to use Hugging Face Transformers with Keras: Fine-tune a non-English BERT for Named Entity Recognition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_26.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c57693e0bde3f67d09ff261d550e8bc90d64120 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_26.txt @@ -0,0 +1 @@ +A notebook for Finetuning BERT for named-entity recognition using only the first wordpiece of each word in the word label during tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_27.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ddba8a09a9938c6ab46adfb3a5493cd3f9df6f4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_27.txt @@ -0,0 +1 @@ +To propagate the label of the word to all wordpieces, see this version of the notebook instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_28.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb9459583c2df5610b7449a74e04ac92310352db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_28.txt @@ -0,0 +1 @@ +[BertForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_29.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..900df7b20a492f5323d7c24d0f62c7370438b675 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_29.txt @@ -0,0 +1 @@ +[TFBertForTokenClassification] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_30.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..65e134cce14111dff1f6e5d4b4b25d4a30ca6176 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_30.txt @@ -0,0 +1 @@ +[FlaxBertForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_31.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_31.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_32.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0d0e67b254a43de0362509eb7d507b7d06cd599 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_32.txt @@ -0,0 +1,3 @@ +Token classification task guide + +[BertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_33.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..27aca6639f14252a1cebbd3c9010be337faba516 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_33.txt @@ -0,0 +1 @@ +[TFBertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_34.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbaed20c74d372e5cee9bdc960ffce787b7c04f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_34.txt @@ -0,0 +1 @@ +[FlaxBertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_35.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_35.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_36.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a5bd49fcdd6bc3e3eca60db6685d9cfdd94fb24 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_36.txt @@ -0,0 +1,3 @@ +Masked language modeling task guide + +[BertForQuestionAnswering] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_37.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d12751a661b123bd5c626f9ab60922491dd43d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_37.txt @@ -0,0 +1 @@ +[TFBertForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_38.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..40487b2d5c18503fc9e4e7e53a64248413efbd14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_38.txt @@ -0,0 +1 @@ +[FlaxBertForQuestionAnswering] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_39.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_39.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_40.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a70897777fe075a854b32fd5bc80b241bf4aa05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_40.txt @@ -0,0 +1,4 @@ +Question answering task guide + +Multiple choice +- [BertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_41.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b19bb9a7603ce70c5834cc85920f4ae624567b7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_41.txt @@ -0,0 +1 @@ +- [TFBertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_42.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d1865e556635198c1fef3edab874f580a96bbe3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_42.txt @@ -0,0 +1,3 @@ +- Multiple choice task guide +âš¡ï¸ Inference +- A blog post on how to Accelerate BERT inference with Hugging Face Transformers and AWS Inferentia. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_43.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..7914d596d894e1830b39e84424ff1ecf61dc61b7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_43.txt @@ -0,0 +1 @@ +- A blog post on how to Accelerate BERT inference with DeepSpeed-Inference on GPUs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_44.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..37f9bce1e405e9035c9c3b8cd62f1139ee90ce6d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_44.txt @@ -0,0 +1,2 @@ +âš™ï¸ Pretraining +- A blog post on Pre-Training BERT with Hugging Face Transformers and Habana Gaudi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_45.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b9d7bae2498708c4c6801ca513ed77d2f686d05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_45.txt @@ -0,0 +1,2 @@ +🚀 Deploy +- A blog post on how to Convert Transformers to ONNX with Hugging Face Optimum. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_46.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..92f9b9343ecee2611cb987e5401c17f94f9da19b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_46.txt @@ -0,0 +1 @@ +- A blog post on how to Setup Deep Learning environment for Hugging Face Transformers with Habana Gaudi on AWS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_47.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3662b5bd58aa1b4a715c20ddb6f3fa66f8b25ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_47.txt @@ -0,0 +1 @@ +- A blog post on Autoscaling BERT with Hugging Face Transformers, Amazon SageMaker and Terraform module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_48.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..681ef8b4c0d349494eb4c101f3776b357082bfd0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_48.txt @@ -0,0 +1 @@ +- A blog post on Serverless BERT with HuggingFace, AWS Lambda, and Docker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_49.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..77a0d5e954d744829b2692face68990a7814363e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_49.txt @@ -0,0 +1 @@ +- A blog post on Hugging Face Transformers BERT fine-tuning using Amazon SageMaker and Training Compiler. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_50.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f7ddfb1151f2f10cfeeaf1035838f296845e05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_50.txt @@ -0,0 +1 @@ +- A blog post on Task-specific knowledge distillation for BERT using Transformers & Amazon SageMaker. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bert/chunk_51.txt b/chunked/content_aware_chunking/model_doc_bert/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..338883e69a01322db64667b64a928df4a7a7b0c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bert/chunk_51.txt @@ -0,0 +1,104 @@ +BertConfig +[[autodoc]] BertConfig + - all +BertTokenizer +[[autodoc]] BertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +BertTokenizerFast +[[autodoc]] BertTokenizerFast + +TFBertTokenizer +[[autodoc]] TFBertTokenizer + +Bert specific outputs +[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput +[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput +[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput + +BertModel +[[autodoc]] BertModel + - forward +BertForPreTraining +[[autodoc]] BertForPreTraining + - forward +BertLMHeadModel +[[autodoc]] BertLMHeadModel + - forward +BertForMaskedLM +[[autodoc]] BertForMaskedLM + - forward +BertForNextSentencePrediction +[[autodoc]] BertForNextSentencePrediction + - forward +BertForSequenceClassification +[[autodoc]] BertForSequenceClassification + - forward +BertForMultipleChoice +[[autodoc]] BertForMultipleChoice + - forward +BertForTokenClassification +[[autodoc]] BertForTokenClassification + - forward +BertForQuestionAnswering +[[autodoc]] BertForQuestionAnswering + - forward + +TFBertModel +[[autodoc]] TFBertModel + - call +TFBertForPreTraining +[[autodoc]] TFBertForPreTraining + - call +TFBertModelLMHeadModel +[[autodoc]] TFBertLMHeadModel + - call +TFBertForMaskedLM +[[autodoc]] TFBertForMaskedLM + - call +TFBertForNextSentencePrediction +[[autodoc]] TFBertForNextSentencePrediction + - call +TFBertForSequenceClassification +[[autodoc]] TFBertForSequenceClassification + - call +TFBertForMultipleChoice +[[autodoc]] TFBertForMultipleChoice + - call +TFBertForTokenClassification +[[autodoc]] TFBertForTokenClassification + - call +TFBertForQuestionAnswering +[[autodoc]] TFBertForQuestionAnswering + - call + +FlaxBertModel +[[autodoc]] FlaxBertModel + - call +FlaxBertForPreTraining +[[autodoc]] FlaxBertForPreTraining + - call +FlaxBertForCausalLM +[[autodoc]] FlaxBertForCausalLM + - call +FlaxBertForMaskedLM +[[autodoc]] FlaxBertForMaskedLM + - call +FlaxBertForNextSentencePrediction +[[autodoc]] FlaxBertForNextSentencePrediction + - call +FlaxBertForSequenceClassification +[[autodoc]] FlaxBertForSequenceClassification + - call +FlaxBertForMultipleChoice +[[autodoc]] FlaxBertForMultipleChoice + - call +FlaxBertForTokenClassification +[[autodoc]] FlaxBertForTokenClassification + - call +FlaxBertForQuestionAnswering +[[autodoc]] FlaxBertForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_4.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d9f19a09b095fd4317c9007e0933e739cacbc67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_4.txt @@ -0,0 +1 @@ +This model was contributed by dqnguyen. 
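The BertConfig/BertTokenizer/Bert*Model listing in chunk_51 above only enumerates the classes and their documented methods; as a hedged illustration of how one of the listed heads is typically loaded, a minimal sketch follows (the bert-base-uncased checkpoint is an assumption, it is not named anywhere in these chunks):

```python
import torch
from transformers import BertForSequenceClassification, BertTokenizer

# bert-base-uncased is an assumed, widely available checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

inputs = tokenizer("The movie was great!", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape: (batch_size, num_labels)
print(logits.argmax(-1).item())
```

The TF and Flax classes in the same listing follow the analogous pattern with `call`/`__call__` instead of `forward`.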
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_5.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_5.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9ad94e0ccb8e7ffab299f7d0d037a6e92395a6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_6.txt @@ -0,0 +1,11 @@ +Usage example +thon + +import torch +from transformers import AutoModel, AutoTokenizer +bertweet = AutoModel.from_pretrained("vinai/bertweet-base") +For transformers v4.x+: +tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False) +For transformers v3.x: +tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base") +INPUT TWEET IS ALREADY NORMALIZED! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a13fd5761626fcbaed002632ff79573a6df2897f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_7.txt @@ -0,0 +1,10 @@ +line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:" +input_ids = torch.tensor([tokenizer.encode(line)]) +with torch.no_grad(): + features = bertweet(input_ids) # Models outputs are now tuples +With TensorFlow 2.0+: +from transformers import TFAutoModel +bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base") + + +This implementation is the same as BERT, except for tokenization method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6959c759efd85cd1e83435815d17ae2c202ff946 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_8.txt @@ -0,0 +1,2 @@ +Refer to BERT documentation for +API reference information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bertweet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bertweet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c269cbbee71995a581cbed248b2f5fa17daabcfe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bertweet/chunk_9.txt @@ -0,0 +1,2 @@ +BertweetTokenizer +[[autodoc]] BertweetTokenizer \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_10.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb55df81fd043b2f70ca3d9a8997f56d9cd6c130 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_10.txt @@ -0,0 +1,2 @@ +The proposed sparse attention can handle sequences of length up to +8x of what was previously possible using similar hardware. 
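The BERTweet usage example split across chunk_6 and chunk_7 above reaches the chunked files with its ```python fence reduced to "thon" and its comment markers dropped; restored to runnable form (same checkpoint and input tweet as in those chunks), it reads roughly as:

```python
import torch
from transformers import AutoModel, AutoTokenizer

# BERTweet model and tokenizer; use_fast=False follows the "transformers v4.x+" note in the chunk.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# The input tweet is assumed to be already normalized, as the original example states.
line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"

input_ids = torch.tensor([tokenizer.encode(line)])
with torch.no_grad():
    features = bertweet(input_ids)  # model outputs are returned as tuples / ModelOutput objects
```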
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_11.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde357876a08314f138f0a652fbc1cc5fdf6ccbe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_11.txt @@ -0,0 +1,2 @@ +As a consequence of the capability to handle longer context, +BigBird drastically improves performance on various NLP tasks such as question answering and summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_12.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e895820d9a4c0800ea32a8c5ca02b6d9a8ddd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_12.txt @@ -0,0 +1,2 @@ +We also +propose novel applications to genomics data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_13.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2ae61b9bdbebd4df7cda526e61361aee3d76d00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by vasudevgupta. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_14.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_14.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_15.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b29840abfdde567a1e04695a5c341873c983315 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +For an in-detail explanation on how BigBird's attention works, see this blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_16.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aa073a341a447d31ee5d8e04f446a1ae3b9b3af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_16.txt @@ -0,0 +1 @@ +BigBird comes with 2 implementations: original_full & block_sparse. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_17.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..017dae335090ce5d94995ee04e277701d16e2649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_17.txt @@ -0,0 +1,2 @@ +For the sequence length < 1024, using + original_full is advised as there is no benefit in using block_sparse attention. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_18.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..de44a8e4fe291f1715bdead75ebe7651c2db871d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_18.txt @@ -0,0 +1 @@ +The code currently uses window size of 3 blocks and 2 global blocks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_19.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc6264c298b9b3a155ff6cfda973787d0ec809e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_19.txt @@ -0,0 +1 @@ +Sequence length must be divisible by block size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_20.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c30deebbfc4f2c73cf80d4d68a19c483847a20ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_20.txt @@ -0,0 +1 @@ +Current implementation supports only ITC. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_21.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..a66abdb0f6dc647ff26994c59a474c5655fb7a67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_21.txt @@ -0,0 +1,3 @@ +Current implementation doesn't support num_random_blocks = 0 +BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_22.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..01c487ecd71cde0c15be65b890c626b5105b3ccf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_22.txt @@ -0,0 +1,71 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +BigBirdConfig +[[autodoc]] BigBirdConfig +BigBirdTokenizer +[[autodoc]] BigBirdTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +BigBirdTokenizerFast +[[autodoc]] BigBirdTokenizerFast +BigBird specific outputs +[[autodoc]] models.big_bird.modeling_big_bird.BigBirdForPreTrainingOutput + +BigBirdModel +[[autodoc]] BigBirdModel + - forward +BigBirdForPreTraining +[[autodoc]] BigBirdForPreTraining + - forward +BigBirdForCausalLM +[[autodoc]] BigBirdForCausalLM + - forward +BigBirdForMaskedLM +[[autodoc]] BigBirdForMaskedLM + - forward +BigBirdForSequenceClassification +[[autodoc]] BigBirdForSequenceClassification + - forward +BigBirdForMultipleChoice +[[autodoc]] BigBirdForMultipleChoice + - forward +BigBirdForTokenClassification +[[autodoc]] BigBirdForTokenClassification + - forward +BigBirdForQuestionAnswering +[[autodoc]] BigBirdForQuestionAnswering + - forward + +FlaxBigBirdModel +[[autodoc]] FlaxBigBirdModel + - call +FlaxBigBirdForPreTraining +[[autodoc]] FlaxBigBirdForPreTraining + - call +FlaxBigBirdForCausalLM +[[autodoc]] FlaxBigBirdForCausalLM + - call +FlaxBigBirdForMaskedLM +[[autodoc]] FlaxBigBirdForMaskedLM + - call +FlaxBigBirdForSequenceClassification +[[autodoc]] FlaxBigBirdForSequenceClassification + - call +FlaxBigBirdForMultipleChoice +[[autodoc]] FlaxBigBirdForMultipleChoice + - call +FlaxBigBirdForTokenClassification +[[autodoc]] FlaxBigBirdForTokenClassification + - call +FlaxBigBirdForQuestionAnswering +[[autodoc]] FlaxBigBirdForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_7.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a43dd4b7b5753701d1be55f6e5d669c20ab401 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_7.txt @@ -0,0 +1,2 @@ +To remedy this, we propose, BigBird, a sparse attention mechanism that +reduces this quadratic dependency to linear. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_8.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0477d06fba5dd06b5708462e7f4c359e296e06b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_8.txt @@ -0,0 +1,2 @@ +We show that BigBird is a universal approximator of sequence functions and +is Turing complete, thereby preserving these properties of the quadratic, full attention model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_big_bird/chunk_9.txt b/chunked/content_aware_chunking/model_doc_big_bird/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..256f959b6c124549fe6caa5e98a75e89e8ff2d10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_big_bird/chunk_9.txt @@ -0,0 +1,3 @@ +Along the way, our +theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire +sequence as part of the sparse attention mechanism. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb55df81fd043b2f70ca3d9a8997f56d9cd6c130 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_10.txt @@ -0,0 +1,2 @@ +The proposed sparse attention can handle sequences of length up to +8x of what was previously possible using similar hardware. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde357876a08314f138f0a652fbc1cc5fdf6ccbe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_11.txt @@ -0,0 +1,2 @@ +As a consequence of the capability to handle longer context, +BigBird drastically improves performance on various NLP tasks such as question answering and summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e895820d9a4c0800ea32a8c5ca02b6d9a8ddd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_12.txt @@ -0,0 +1,2 @@ +We also +propose novel applications to genomics data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b29840abfdde567a1e04695a5c341873c983315 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_14.txt @@ -0,0 +1,3 @@ +Usage tips + +For an in-detail explanation on how BigBird's attention works, see this blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aa073a341a447d31ee5d8e04f446a1ae3b9b3af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_15.txt @@ -0,0 +1 @@ +BigBird comes with 2 implementations: original_full & block_sparse. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..017dae335090ce5d94995ee04e277701d16e2649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_16.txt @@ -0,0 +1,2 @@ +For the sequence length < 1024, using + original_full is advised as there is no benefit in using block_sparse attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..de44a8e4fe291f1715bdead75ebe7651c2db871d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_17.txt @@ -0,0 +1 @@ +The code currently uses window size of 3 blocks and 2 global blocks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc6264c298b9b3a155ff6cfda973787d0ec809e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_18.txt @@ -0,0 +1 @@ +Sequence length must be divisible by block size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..c30deebbfc4f2c73cf80d4d68a19c483847a20ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_19.txt @@ -0,0 +1 @@ +Current implementation supports only ITC. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b015d47dbb5a9578adb4142d8d86b4607fd64426 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_20.txt @@ -0,0 +1 @@ +Current implementation doesn't support num_random_blocks = 0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb073e62efd68f11232b6440c262d2031657f762 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_21.txt @@ -0,0 +1 @@ +BigBirdPegasus uses the PegasusTokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec60442a93aeb9126195e24d9656466b0317327e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_22.txt @@ -0,0 +1,2 @@ +BigBird is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..73235bc1fa8cff371d53229effd7aec0903e8c5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_23.txt @@ -0,0 +1,26 @@ +Resources + +Text classification task guide +Question answering task guide +Causal language modeling task guide +Translation task guide +Summarization task guide + +BigBirdPegasusConfig +[[autodoc]] BigBirdPegasusConfig + - all +BigBirdPegasusModel +[[autodoc]] BigBirdPegasusModel + - forward +BigBirdPegasusForConditionalGeneration +[[autodoc]] BigBirdPegasusForConditionalGeneration + - forward +BigBirdPegasusForSequenceClassification +[[autodoc]] BigBirdPegasusForSequenceClassification + - forward +BigBirdPegasusForQuestionAnswering +[[autodoc]] BigBirdPegasusForQuestionAnswering + - forward +BigBirdPegasusForCausalLM +[[autodoc]] BigBirdPegasusForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a43dd4b7b5753701d1be55f6e5d669c20ab401 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_7.txt @@ -0,0 +1,2 @@ +To remedy this, we propose, BigBird, a sparse attention mechanism that +reduces this quadratic dependency to linear. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0477d06fba5dd06b5708462e7f4c359e296e06b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_8.txt @@ -0,0 +1,2 @@ +We show that BigBird is a universal approximator of sequence functions and +is Turing complete, thereby preserving these properties of the quadratic, full attention model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..256f959b6c124549fe6caa5e98a75e89e8ff2d10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bigbird_pegasus/chunk_9.txt @@ -0,0 +1,3 @@ +Along the way, our +theoretical analysis reveals some of the benefits of having O(1) global tokens (such as CLS), that attend to the entire +sequence as part of the sparse attention mechanism. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..80b157e5424805063fa6a44c341e5246f88d0faf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_10.txt @@ -0,0 +1 @@ +Our case study on text generation further demonstrates the advantage of BioGPT on biomedical literature to generate fluent descriptions for biomedical terms. 
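The BigBirdPegasus listing in chunk_23 above pairs BigBirdPegasusForConditionalGeneration with the summarization task guide; a minimal, hedged summarization sketch might look like the following (the google/bigbird-pegasus-large-arxiv checkpoint is an assumption, and the placeholder article stands in for a real long document):

```python
from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration

# Assumed public BigBirdPegasus summarization checkpoint.
checkpoint = "google/bigbird-pegasus-large-arxiv"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BigBirdPegasusForConditionalGeneration.from_pretrained(checkpoint)

article = "Replace this placeholder with a long scientific article to summarize."
inputs = tokenizer(article, return_tensors="pt", truncation=True)

# Beam search decoding of a short summary.
summary_ids = model.generate(**inputs, num_beams=4, max_length=64)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```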
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..50efbbc4a6c76f2a12e3fe92f7cb8d14ecaf07a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by kamalkraj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..335acd270796e186613595392dfaaebd5e0647cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6bd1bc460e770384765132bcec908335650c1e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_14.txt @@ -0,0 +1 @@ +BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaa5fc14a82ed1e0f85984a72dbbf10f0395c7c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_15.txt @@ -0,0 +1 @@ +Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..36fd57c8796d709de0cd3c7a10017d49b65714c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_16.txt @@ -0,0 +1 @@ +The model can take the past_key_values (for PyTorch) as input, which is the previously computed key/value attention pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..48d8e5846a54e2f32486cb8b5cf3e4d6c927206e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_17.txt @@ -0,0 +1 @@ +Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. 
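To make the past_key_values caching described in the BioGPT chunks above (chunk_16 and chunk_17) concrete, here is a minimal generation sketch; the microsoft/biogpt checkpoint name is an assumption, and generate() with use_cache=True is simply the standard way to let the model reuse previously computed key/value pairs instead of recomputing them at every decoding step:

```python
import torch
from transformers import BioGptForCausalLM, BioGptTokenizer

# Assumed public BioGPT checkpoint.
tokenizer = BioGptTokenizer.from_pretrained("microsoft/biogpt")
model = BioGptForCausalLM.from_pretrained("microsoft/biogpt")

inputs = tokenizer("COVID-19 is", return_tensors="pt")

# use_cache=True lets generate() carry past_key_values between steps, so attention
# over the already-processed prefix is not recomputed for each new token.
with torch.no_grad():
    output_ids = model.generate(**inputs, max_length=30, use_cache=True)

print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```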
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..134a419276285aaa50bbdf0b142177f10633d0a0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_18.txt @@ -0,0 +1 @@ +For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fe7255628d9d903b141c80220504c82c095a090 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_19.txt @@ -0,0 +1,21 @@ +Resources + +Causal language modeling task guide + +BioGptConfig +[[autodoc]] BioGptConfig +BioGptTokenizer +[[autodoc]] BioGptTokenizer + - save_vocabulary +BioGptModel +[[autodoc]] BioGptModel + - forward +BioGptForCausalLM +[[autodoc]] BioGptForCausalLM + - forward +BioGptForTokenClassification +[[autodoc]] BioGptForTokenClassification + - forward +BioGptForSequenceClassification +[[autodoc]] BioGptForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..961a624d85fbfac1d7291edbc90c9322c333ceaa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_8.txt @@ -0,0 +1 @@ +We evaluate BioGPT on six biomedical natural language processing tasks and demonstrate that our model outperforms previous models on most tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_biogpt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_biogpt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..44ba93f5bdebfe0b989a28d5f3c48d0490909f46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_biogpt/chunk_9.txt @@ -0,0 +1 @@ +Especially, we get 44.98%, 38.42% and 40.76% F1 score on BC5CDR, KD-DTI and DDI end-to-end relation extraction tasks, respectively, and 78.2% accuracy on PubMedQA, creating a new record. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..16246018d840e9eb1ff44f6afd316d01d081953c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_10.txt @@ -0,0 +1 @@ +We conduct detailed analysis of the main components that lead to high transfer performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..53bd32c85599cfe333aa326f1de80ffa3ff8813b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_13.txt @@ -0,0 +1,4 @@ +Usage tips + +BiT models are equivalent to ResNetv2 in terms of architecture, except that: 1) all batch normalization layers are replaced by group normalization, +2) weight standardization is used for convolutional layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f3d01b5cbba10407a10b7c6f1e303bd5994e922 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_14.txt @@ -0,0 +1,2 @@ +The authors show that the combination of both is useful for training with large batch sizes, and has a significant +impact on transfer learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b53d379afde9f6ed0d42b4504346c45f5f0995aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BiT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..36d8889a746ffcacf5105866d3469c83b182d5b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_16.txt @@ -0,0 +1 @@ +[BitForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
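The BiT usage tips above (chunk_13 to chunk_17) point at [BitForImageClassification] without showing a call; a small, hedged sketch follows, where the google/bit-50 checkpoint and the example.jpg path are assumptions/placeholders:

```python
import torch
from PIL import Image
from transformers import BitForImageClassification, BitImageProcessor

# Assumed BiT checkpoint fine-tuned for image classification.
processor = BitImageProcessor.from_pretrained("google/bit-50")
model = BitForImageClassification.from_pretrained("google/bit-50")

image = Image.open("example.jpg")  # placeholder image path
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted = logits.argmax(-1).item()
print(model.config.id2label[predicted])
```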
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..721b9f698dca9430f76fee65d11c3e71b586fb46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_19.txt @@ -0,0 +1,11 @@ +BitConfig +[[autodoc]] BitConfig +BitImageProcessor +[[autodoc]] BitImageProcessor + - preprocess +BitModel +[[autodoc]] BitModel + - forward +BitForImageClassification +[[autodoc]] BitForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d1c1f24edd14a748127324d3f1436c2aa493ae3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_6.txt @@ -0,0 +1 @@ +By combining a few carefully selected components, and transferring using a simple heuristic, we achieve strong performance on over 20 datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..94ce6ed2ffd15e6decdeb9c07679c456041b73f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_7.txt @@ -0,0 +1 @@ +BiT performs well across a surprisingly wide range of data regimes -- from 1 example per class to 1M total examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..33dfd6f67042d7bd5f57c94fb2c25f4803bf8f98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_8.txt @@ -0,0 +1 @@ +BiT achieves 87.5% top-1 accuracy on ILSVRC-2012, 99.4% on CIFAR-10, and 76.3% on the 19 task Visual Task Adaptation Benchmark (VTAB). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf3808f39943879e687f2fa6a48022d4c099820 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bit/chunk_9.txt @@ -0,0 +1 @@ +On small datasets, BiT attains 76.8% on ILSVRC-2012 with 10 examples per class, and 97.0% on CIFAR-10 with 10 examples per class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_10.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7467ba25cc0e40d6fdce3bb82bbbb2a51022e2c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_10.txt @@ -0,0 +1,2 @@ +The authors' code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_11.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9945952f43b8e5129c54b48499d82bc062711428 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips +Blenderbot Small is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than +the left. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_12.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..129bb08b09e1463bc522f034c5da68e2931e0ecb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_12.txt @@ -0,0 +1,44 @@ +Resources + +Causal language modeling task guide +Translation task guide +Summarization task guide + +BlenderbotSmallConfig +[[autodoc]] BlenderbotSmallConfig +BlenderbotSmallTokenizer +[[autodoc]] BlenderbotSmallTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +BlenderbotSmallTokenizerFast +[[autodoc]] BlenderbotSmallTokenizerFast + +BlenderbotSmallModel +[[autodoc]] BlenderbotSmallModel + - forward +BlenderbotSmallForConditionalGeneration +[[autodoc]] BlenderbotSmallForConditionalGeneration + - forward +BlenderbotSmallForCausalLM +[[autodoc]] BlenderbotSmallForCausalLM + - forward + +TFBlenderbotSmallModel +[[autodoc]] TFBlenderbotSmallModel + - call +TFBlenderbotSmallForConditionalGeneration +[[autodoc]] TFBlenderbotSmallForConditionalGeneration + - call + +FlaxBlenderbotSmallModel +[[autodoc]] FlaxBlenderbotSmallModel + - call + - encode + - decode +FlaxBlenderbotForConditionalGeneration +[[autodoc]] FlaxBlenderbotSmallForConditionalGeneration + - call + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_5.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..8270566088fd135d50af658e330ecacbcbb70c2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_5.txt @@ -0,0 +1,2 @@ +We show that large scale models can learn these skills when given appropriate training data and choice of +generation strategy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_6.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d347ae9c9f2981487e3599594a632c7da95563c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_6.txt @@ -0,0 +1,2 @@ +We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models +and code publicly available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_7.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..28e0d42cd99a8f8163cf658afe72b3f63201f511 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_7.txt @@ -0,0 +1,2 @@ +Human evaluations show our best models are superior to existing approaches in multi-turn +dialogue in terms of engagingness and humanness measurements. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_8.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4db240c006ca81729d1bdbc3b74f8d420fbfeb4a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_8.txt @@ -0,0 +1,2 @@ +We then discuss the limitations of this work by analyzing +failure cases of our models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_9.txt b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot-small/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_10.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa1f0edb48b2fb83839a50b53bfa15dbd0b04b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips and example +Blenderbot is a model with absolute position embeddings so it's usually advised to pad the inputs on the right +rather than the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_11.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc5b105413a1e12445164e154ad3fe2f971edef5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_11.txt @@ -0,0 +1,8 @@ +An example: +thon + +from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration +mname = "facebook/blenderbot-400M-distill" +model = BlenderbotForConditionalGeneration.from_pretrained(mname) +tokenizer = BlenderbotTokenizer.from_pretrained(mname) +UTTERANCE = "My friends are cool but they eat too many carbs." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_12.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce3535ffa7c1ba1aad01c00b5bdd3402dcafcb14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_12.txt @@ -0,0 +1,4 @@ +inputs = tokenizer([UTTERANCE], return_tensors="pt") +reply_ids = model.generate(**inputs) +print(tokenizer.batch_decode(reply_ids)) +[" That's unfortunate. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_13.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..84952e0706c88823935f20015dd59a595c35bfaa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_13.txt @@ -0,0 +1 @@ +Are they trying to lose weight or are they just trying to be healthier?"] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_14.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c3e4164bc89b8b5e7ebb36a8260829914381c3f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_14.txt @@ -0,0 +1,3 @@ +Implementation Notes + +Blenderbot uses a standard seq2seq model transformer based architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_15.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..01a43b13f773852789a2e277d4b3bb417e62f954 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_15.txt @@ -0,0 +1 @@ +Available checkpoints can be found in the model hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_16.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6dc75c384bc8745fc44aeda4ed26872e4f8204d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_16.txt @@ -0,0 +1 @@ +This is the default Blenderbot model class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_17.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d2601bf91c0a13d8bfaa524e53aec08af78ebcd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_17.txt @@ -0,0 +1,3 @@ +However, some smaller checkpoints, such as + facebook/blenderbot_small_90M, have a different architecture and consequently should be used with + BlenderbotSmall. 
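The Blenderbot example spread over chunk_11 to chunk_13 above also loses its ```python fence ("thon") in chunking; restored to runnable form with the same facebook/blenderbot-400M-distill checkpoint and utterance, it is roughly:

```python
from transformers import BlenderbotForConditionalGeneration, BlenderbotTokenizer

mname = "facebook/blenderbot-400M-distill"
model = BlenderbotForConditionalGeneration.from_pretrained(mname)
tokenizer = BlenderbotTokenizer.from_pretrained(mname)

UTTERANCE = "My friends are cool but they eat too many carbs."
inputs = tokenizer([UTTERANCE], return_tensors="pt")

# Generate a reply and decode it back to text.
reply_ids = model.generate(**inputs)
print(tokenizer.batch_decode(reply_ids, skip_special_tokens=True))
```

As chunk_17 notes, smaller checkpoints such as facebook/blenderbot_small_90M use the BlenderbotSmall classes instead.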
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_18.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6886840b997ea28cb5098986f4162f7bf843c7e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_18.txt @@ -0,0 +1,44 @@ +Resources + +Causal language modeling task guide +Translation task guide +Summarization task guide + +BlenderbotConfig +[[autodoc]] BlenderbotConfig +BlenderbotTokenizer +[[autodoc]] BlenderbotTokenizer + - build_inputs_with_special_tokens +BlenderbotTokenizerFast +[[autodoc]] BlenderbotTokenizerFast + - build_inputs_with_special_tokens + +BlenderbotModel +See [~transformers.BartModel] for arguments to forward and generate +[[autodoc]] BlenderbotModel + - forward +BlenderbotForConditionalGeneration +See [~transformers.BartForConditionalGeneration] for arguments to forward and generate +[[autodoc]] BlenderbotForConditionalGeneration + - forward +BlenderbotForCausalLM +[[autodoc]] BlenderbotForCausalLM + - forward + +TFBlenderbotModel +[[autodoc]] TFBlenderbotModel + - call +TFBlenderbotForConditionalGeneration +[[autodoc]] TFBlenderbotForConditionalGeneration + - call + +FlaxBlenderbotModel +[[autodoc]] FlaxBlenderbotModel + - call + - encode + - decode +FlaxBlenderbotForConditionalGeneration +[[autodoc]] FlaxBlenderbotForConditionalGeneration + - call + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_7.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4db240c006ca81729d1bdbc3b74f8d420fbfeb4a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_7.txt @@ -0,0 +1,2 @@ +We then discuss the limitations of this work by analyzing +failure cases of our models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_8.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1058a4cbf916966b2ac412ef8f409ed9bfe50f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by sshleifer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blenderbot/chunk_9.txt b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..5caf364218a1a88dde90c1bfd0a3a1c1396694e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blenderbot/chunk_9.txt @@ -0,0 +1 @@ +The authors' code can be found here . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a94f17c93e2f5d82b833b417bac406082ba5364f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_10.txt @@ -0,0 +1 @@ +We also demonstrate the model's emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..bda2d6a7ca71a9ca95551485e5860f275165801c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_11.txt @@ -0,0 +1 @@ +BLIP-2 architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc8ff826f34c78c05e9594ef5f8eba4b65ce34ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +BLIP-2 can be used for conditional text generation given an image and an optional text prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c52770a0652c9de932f4bb456418b261add4d7b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_16.txt @@ -0,0 +1 @@ +At inference time, it's recommended to use the [generate] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a3a4e4d05cacc2f5e4cefebaaa332f7db88c243 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_17.txt @@ -0,0 +1 @@ +One can use [Blip2Processor] to prepare images for the model, and decode the predicted tokens ID's back to text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e92287c4fdfb31c1fe3270fc7553f9c83afbb4f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_18.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLIP-2. 
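The usage tips above (prepare inputs with [Blip2Processor], call generate at inference time, then decode the predicted token IDs back to text) can be put together as the minimal sketch below; the "Salesforce/blip2-opt-2.7b" checkpoint id and the example image URL are assumptions rather than values stated in the chunks above.

```python
# Minimal sketch of BLIP-2 conditional generation from an image plus an optional prompt.
# Assumptions: the "Salesforce/blip2-opt-2.7b" checkpoint id and the example image URL.
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")

# prepare the image (and an optional text prompt), generate, then decode token IDs to text
inputs = processor(images=image, text="Question: how many cats are there? Answer:", return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
```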
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc74ca107a2f107ff781df5c87e7089d82bb41f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_19.txt @@ -0,0 +1 @@ +Demo notebooks for BLIP-2 for image captioning, visual question answering (VQA) and chat-like conversations can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_20.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d939c4fcc74ff3a7c0a5899e3c2f6a1a5472840d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_22.txt @@ -0,0 +1,25 @@ +Blip2Config +[[autodoc]] Blip2Config + - from_vision_qformer_text_configs +Blip2VisionConfig +[[autodoc]] Blip2VisionConfig +Blip2QFormerConfig +[[autodoc]] Blip2QFormerConfig +Blip2Processor +[[autodoc]] Blip2Processor +Blip2VisionModel +[[autodoc]] Blip2VisionModel + - forward +Blip2QFormerModel +[[autodoc]] Blip2QFormerModel + - forward +Blip2Model +[[autodoc]] Blip2Model + - forward + - get_text_features + - get_image_features + - get_qformer_features +Blip2ForConditionalGeneration +[[autodoc]] Blip2ForConditionalGeneration + - forward + - generate \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf86ceebbabf94de86c02cba8acffa4094301b30 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_6.txt @@ -0,0 +1 @@ +The first stage bootstraps vision-language representation learning from a frozen image encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..21d515cff06fc07dbdd00a5234f7fcef58b22e81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_7.txt @@ -0,0 +1 @@ +The second stage bootstraps vision-to-language generative learning from a frozen language model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa42611920bec63c96497cabf4031ff34aef6a94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_8.txt @@ -0,0 +1 @@ +BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip-2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_blip-2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d57533f991b5f31deac6838495a76b910b38ff2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip-2/chunk_9.txt @@ -0,0 +1 @@ +For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c809220ddcca443cd480819638e6a40e0dd2f5ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_11.txt @@ -0,0 +1,58 @@ +Resources + +Jupyter notebook on how to fine-tune BLIP for image captioning on a custom dataset + +BlipConfig +[[autodoc]] BlipConfig + - from_text_vision_configs +BlipTextConfig +[[autodoc]] BlipTextConfig +BlipVisionConfig +[[autodoc]] BlipVisionConfig +BlipProcessor +[[autodoc]] BlipProcessor +BlipImageProcessor +[[autodoc]] BlipImageProcessor + - preprocess + +BlipModel +[[autodoc]] BlipModel + - forward + - get_text_features + - get_image_features +BlipTextModel +[[autodoc]] BlipTextModel + - forward +BlipVisionModel +[[autodoc]] BlipVisionModel + - forward +BlipForConditionalGeneration +[[autodoc]] BlipForConditionalGeneration + - forward +BlipForImageTextRetrieval +[[autodoc]] BlipForImageTextRetrieval + - forward +BlipForQuestionAnswering +[[autodoc]] BlipForQuestionAnswering + - forward + +TFBlipModel +[[autodoc]] TFBlipModel + - call + - get_text_features + - get_image_features +TFBlipTextModel +[[autodoc]] TFBlipTextModel + - call +TFBlipVisionModel +[[autodoc]] TFBlipVisionModel + - call +TFBlipForConditionalGeneration +[[autodoc]] TFBlipForConditionalGeneration + - call +TFBlipForImageTextRetrieval +[[autodoc]] TFBlipForImageTextRetrieval + - call +TFBlipForQuestionAnswering +[[autodoc]] TFBlipForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_5.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a5a5d2cfe00c16e2df4f4ee91dbb746bf3d46f2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_5.txt @@ -0,0 +1 @@ +BLIP effectively utilizes the noisy web data by bootstrapping the captions, where a captioner generates synthetic captions and a filter removes the noisy ones. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_6.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8a42db25cc90346d021edcb98ad15c184cd9373 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_6.txt @@ -0,0 +1 @@ +We achieve state-of-the-art results on a wide range of vision-language tasks, such as image-text retrieval (+2.7% in average recall@1), image captioning (+2.8% in CIDEr), and VQA (+1.6% in VQA score). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_7.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1522ca3580a5d40c6b4704541585d87b3aab4f08 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_7.txt @@ -0,0 +1 @@ +BLIP also demonstrates strong generalization ability when directly transferred to video-language tasks in a zero-shot manner. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..42166b56a31390a897f604b3a26b289e3b16bca9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_8.txt @@ -0,0 +1 @@ +Code, models, and datasets are released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_blip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_blip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b88f6b94f0e7b9f14885823b777caa6d5a0d5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_blip/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by ybelkada. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9b65474a0ebf1120c935495feee2549a84ec285 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_10.txt @@ -0,0 +1,2 @@ +⚙️ Training +- A blog on The Technology Behind BLOOM Training.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5da385929110c7c40f5749bacf90201488ccca3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_11.txt @@ -0,0 +1,29 @@ +BloomConfig +[[autodoc]] BloomConfig + - all +BloomTokenizerFast +[[autodoc]] BloomTokenizerFast + - all + +BloomModel +[[autodoc]] BloomModel + - forward +BloomForCausalLM +[[autodoc]] BloomForCausalLM + - forward +BloomForSequenceClassification +[[autodoc]] BloomForSequenceClassification + - forward +BloomForTokenClassification +[[autodoc]] BloomForTokenClassification + - forward +BloomForQuestionAnswering +[[autodoc]] BloomForQuestionAnswering + - forward + +FlaxBloomModel +[[autodoc]] FlaxBloomModel + - call +FlaxBloomForCausalLM +[[autodoc]] FlaxBloomForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_4.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..e75f8ed756bdaba7680a6e809109990e3d35021d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_4.txt @@ -0,0 +1,11 @@ +BLOOM is available in the following versions: + +bloom-560m +bloom-1b1 +bloom-1b7 +bloom-3b +bloom-7b1 +bloom (176B parameters) + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BLOOM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_5.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_5.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_6.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_6.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f32d0f8ac0c1ed962c731fab36185f54a9c7899c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_7.txt @@ -0,0 +1 @@ +[BloomForCausalLM] is supported by this causal language modeling example script and notebook. 
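Since the chunks above list [BloomForCausalLM] and the available checkpoint sizes, a short causal-generation sketch follows; the full hub id "bigscience/bloom-560m" is an assumption (the text above only names the "bloom-560m" version).

```python
# Minimal sketch of causal text generation with the smallest BLOOM checkpoint.
# Assumption: the "bigscience/bloom-560m" hub id for the "bloom-560m" version listed above.
from transformers import AutoTokenizer, BloomForCausalLM

tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model = BloomForCausalLM.from_pretrained("bigscience/bloom-560m")

inputs = tokenizer("BLOOM is a multilingual language model that", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=20, do_sample=False)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```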
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cf528aae513b5ba51f98f31bf933ba85002ae40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_8.txt @@ -0,0 +1,7 @@ +See also: +- Causal language modeling task guide +- Text classification task guide +- Token classification task guide +- Question answering task guide +⚡️ Inference +- A blog on Optimization story: Bloom inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bloom/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bloom/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..26a080ef4942ff93ddd919319d18313812b661d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bloom/chunk_9.txt @@ -0,0 +1 @@ +- A blog on Incredibly Fast BLOOM Inference with DeepSpeed and Accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..93cbb4442d1e14bab327a44d82ff911c1a492d16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by stefan-it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff1cc46b838de6ff53746850a1bba404b9e2026a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +BORT's model architecture is based on BERT; refer to BERT's documentation page for the + model's API reference as well as usage examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..51433ccae4fdc44ef9fdd0b629c4a407c9916ccc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_13.txt @@ -0,0 +1 @@ +BORT uses the RoBERTa tokenizer instead of the BERT tokenizer; refer to RoBERTa's documentation page for the tokenizer's API reference as well as usage examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a7e329c8afda7d48b12f1ced1f666cc48569ba4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_14.txt @@ -0,0 +1,2 @@ +BORT requires a specific fine-tuning algorithm, called Agora, + that is sadly not open-sourced yet.
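The BORT tips above boil down to loading a BERT-style encoder together with a RoBERTa tokenizer; a hypothetical sketch is below. The "amazon/bort" hub id is an assumption and is not named in the chunks above, and fine-tuning would still require the unreleased Agora algorithm.

```python
# Hypothetical sketch of the pairing described above: BERT-style body, RoBERTa tokenizer.
# Assumption: the "amazon/bort" checkpoint id; the Auto classes resolve the concrete
# tokenizer and model types from the checkpoint's config.
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("amazon/bort")
model = AutoModel.from_pretrained("amazon/bort")

inputs = tokenizer("BORT pairs a compressed BERT encoder with the RoBERTa tokenizer.", return_tensors="pt")
outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```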
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1812eaf4c0cddc4f2ec505edd09d886aa844141 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_15.txt @@ -0,0 +1,2 @@ +It would be very useful for the community, if someone tries to implement the + algorithm to make BORT fine-tuning work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6221c4eae437a21991ae2e52fbb08f828bc4f2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_7.txt @@ -0,0 +1,3 @@ +This optimal subset, which we refer to as +"Bort", is demonstrably smaller, having an effective (that is, not counting the embedding layer) size of 5.5% the +original BERT-large architecture, and 16% of the net size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d487617a4e816e2a4ed7dd5ea48074de8f3415a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_8.txt @@ -0,0 +1,4 @@ +Bort is also able to be pretrained in 288 GPU hours, which +is 1.2% of the time required to pretrain the highest-performing BERT parametric architectural variant, RoBERTa-large +(Liu et al., 2019), and about 33% of that of the world-record, in GPU hours, required to train BERT-large on the same +hardware. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bort/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bort/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c176c9d06cdb51e6e4883bdbf2a85c27da6dc9a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bort/chunk_9.txt @@ -0,0 +1,3 @@ +It is also 7.9x faster on a CPU, as well as being better performing than other compressed variants of the +architecture, and some of the non-compressed variants: it obtains performance improvements of between 0.3% and 31%, +absolute, with respect to BERT-large, on multiple public natural language understanding (NLU) benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_10.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..491c79c553ad1431ef007174e2cdf048386f9953 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_10.txt @@ -0,0 +1 @@ +Notably, when further scaling the model, BRIDGETOWER achieves an accuracy of 81.15%, surpassing models that are pre-trained on orders-of-magnitude larger datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_11.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ecf18708ec3eb0cfd8d8b83d11878e82d71f563 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_11.txt @@ -0,0 +1 @@ +BridgeTower architecture. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_12.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e855437867357d0d0064e59620cc6294126da68 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by Anahita Bhiwandiwalla, Tiep Le and Shaoyen Tseng. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c3574aecb7686bcfd8a061b39df104e5fb57b22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_15.txt @@ -0,0 +1,2 @@ +Usage tips and examples +BridgeTower consists of a visual encoder, a textual encoder and cross-modal encoder with multiple lightweight bridge layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d999aff5ebb0012d0a6414e644465e9cfc833f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_16.txt @@ -0,0 +1 @@ +The goal of this approach was to build a bridge between each uni-modal encoder and the cross-modal encoder to enable comprehensive and detailed interaction at each layer of the cross-modal encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..e99ccc8eff9ceb751051107613ceba044db8f830 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_17.txt @@ -0,0 +1 @@ +In principle, one can apply any visual, textual or cross-modal encoder in the proposed architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab12e419659fc74a887d5ff8607258a013439d49 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_18.txt @@ -0,0 +1,2 @@ +The [BridgeTowerProcessor] wraps [RobertaTokenizer] and [BridgeTowerImageProcessor] into a single instance to both +encode the text and prepare the images respectively. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ca09f6e8e66597b3ae7e697085a5b4d0175f7d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_19.txt @@ -0,0 +1 @@ +The following example shows how to run contrastive learning using [BridgeTowerProcessor] and [BridgeTowerForContrastiveLearning]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e599db55556e802a3786b1316e98510421f8a19 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_20.txt @@ -0,0 +1,19 @@ +thon + +from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning +import requests +from PIL import Image +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"] +processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") +model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") +forward pass +scores = dict() +for text in texts: + # prepare inputs + encoding = processor(image, text, return_tensors="pt") + outputs = model(**encoding) + scores[text] = outputs + +The following example shows how to run image-text retrieval using [BridgeTowerProcessor] and [BridgeTowerForImageAndTextRetrieval]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0c9e11c77af62fff29a459f708435469378b844 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_21.txt @@ -0,0 +1,19 @@ +thon + +from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval +import requests +from PIL import Image +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"] +processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +forward pass +scores = dict() +for text in texts: + # prepare inputs + encoding = processor(image, text, return_tensors="pt") + outputs = model(**encoding) + scores[text] = outputs.logits[0, 1].item() + +The following example shows how to run masked language modeling using [BridgeTowerProcessor] and [BridgeTowerForMaskedLM]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a407438bcd01e4fd5f6c684a20d93513a26c7d94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_22.txt @@ -0,0 +1,17 @@ +thon + +from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM +from PIL import Image +import requests +url = "http://images.cocodataset.org/val2017/000000360943.jpg" +image = Image.open(requests.get(url, stream=True).raw).convert("RGB") +text = "a looking out of the window" +processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") +prepare inputs +encoding = processor(image, text, return_tensors="pt") +forward pass +outputs = model(**encoding) +results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist()) +print(results) +.a cat looking out of the window. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..511e4378ec8a6414337cdc5d2914848bbfd9231b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_23.txt @@ -0,0 +1,3 @@ +Tips: + +This implementation of BridgeTower uses [RobertaTokenizer] to generate text embeddings and OpenAI's CLIP/ViT model to compute visual embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_24.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..156a5b53db7e23d14d560b051e79b0ed0613c57b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_24.txt @@ -0,0 +1 @@ +Checkpoints for pre-trained bridgeTower-base and bridgetower masked language modeling and image text matching are released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_25.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..df6516d2eeeecc05406c6f9f23d5f1dd50176e38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_25.txt @@ -0,0 +1 @@ +Please refer to Table 5 for BridgeTower's performance on Image Retrieval and other down stream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_26.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..66304687e043ed5506837682fa9b133716659337 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_26.txt @@ -0,0 +1 @@ +The PyTorch version of this model is only available in torch 1.10 and higher. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_27.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f3e4c98625287fd7e8fa2903e82340851b7001 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_27.txt @@ -0,0 +1,24 @@ +BridgeTowerConfig +[[autodoc]] BridgeTowerConfig +BridgeTowerTextConfig +[[autodoc]] BridgeTowerTextConfig +BridgeTowerVisionConfig +[[autodoc]] BridgeTowerVisionConfig +BridgeTowerImageProcessor +[[autodoc]] BridgeTowerImageProcessor + - preprocess +BridgeTowerProcessor +[[autodoc]] BridgeTowerProcessor + - call +BridgeTowerModel +[[autodoc]] BridgeTowerModel + - forward +BridgeTowerForContrastiveLearning +[[autodoc]] BridgeTowerForContrastiveLearning + - forward +BridgeTowerForMaskedLM +[[autodoc]] BridgeTowerForMaskedLM + - forward +BridgeTowerForImageAndTextRetrieval +[[autodoc]] BridgeTowerForImageAndTextRetrieval + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_7.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b1fde3819ddf886f43d8d4d75739b1df595e367 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_7.txt @@ -0,0 +1 @@ +This enables effective bottom-up cross-modal alignment and fusion between visual and textual representations of different semantic levels of pre-trained uni-modal encoders in the cross-modal encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_8.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cd3b1e7221ffea82b889a4c957182e4d29b5bba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_8.txt @@ -0,0 +1 @@ +Pre-trained with only 4M images, BRIDGETOWER achieves state-of-the-art performance on various downstream vision-language tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bridgetower/chunk_9.txt b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..946a9228c4b1890525fa7d91d9994b3107054d46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bridgetower/chunk_9.txt @@ -0,0 +1 @@ +In particular, on the VQAv2 test-std set, BRIDGETOWER achieves an accuracy of 78.73%, outperforming the previous state-of-the-art model METER by 1.09% with the same pre-training data and almost negligible additional parameters and computational costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_13.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..221072525ade55323f5f6be66bea359dba982127 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_13.txt @@ -0,0 +1 @@ +BrosForTokenClassification and BrosSpadeEEForTokenClassification essentially perform the same job. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_14.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..c02ecba82a82e72c89caf68b3a500dde4088ba2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_14.txt @@ -0,0 +1 @@ +However, BrosForTokenClassification assumes input tokens are perfectly serialized (which is very challenging task since they exist in a 2D space), while BrosSpadeEEForTokenClassification allows for more flexibility in handling serialization errors as it predicts next connection tokens from one token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_15.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fb4ca8835df3758e345ba6783f74e54ab352bd7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_15.txt @@ -0,0 +1 @@ +BrosSpadeELForTokenClassification perform the intra-entity linking task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_16.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..836433ff8237267cd1008e2d347b182eab20c637 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_16.txt @@ -0,0 +1 @@ +It predicts relation from one token (of one entity) to another token (of another entity) if these two entities share some relation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_17.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f98151b5d1fbde6abe14e7ec5b7f01d855f6377c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_17.txt @@ -0,0 +1 @@ +BROS achieves comparable or better result on Key Information Extraction (KIE) benchmarks such as FUNSD, SROIE, CORD and SciTSR, without relying on explicit visual features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_18.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4af4166cebae824bd01531a505c613ebee48702b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_18.txt @@ -0,0 +1,2 @@ +The abstract from the paper is the following: +Key information extraction (KIE) from document images requires understanding the contextual and spatial semantics of texts in two-dimensional (2D) space. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_19.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab646a4057683d322073d29c89954a0de68a6aab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_19.txt @@ -0,0 +1 @@ +Many recent studies try to solve the task by developing pre-trained language models focusing on combining visual features from document images with texts and their layout. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_20.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..527cec8ce355f368cf635c4e0d7edb829042d28a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_20.txt @@ -0,0 +1 @@ +On the other hand, this paper tackles the problem by going back to the basic: effective combination of text and layout. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_21.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..09822746a26f73f5ec2a8aa9575e5d4d8dedfe5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_21.txt @@ -0,0 +1 @@ +Specifically, we propose a pre-trained language model, named BROS (BERT Relying On Spatiality), that encodes relative positions of texts in 2D space and learns from unlabeled documents with area-masking strategy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_22.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6bbe9c0db44204cd640f67b5cba76712936fe22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_22.txt @@ -0,0 +1 @@ +With this optimized training scheme for understanding texts in 2D space, BROS shows comparable or better performance compared to previous methods on four KIE benchmarks (FUNSD, SROIE, CORD, and SciTSR) without relying on visual features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_23.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..422e941c02df282062c179916b33b7ee227ee65e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_23.txt @@ -0,0 +1 @@ +This paper also reveals two real-world challenges in KIE tasks-(1) minimizing the error from incorrect text ordering and (2) efficient learning from fewer downstream examples-and demonstrates the superiority of BROS over previous methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_24.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..6afb9041d7bcdca92ade532dffb3e4fbefd9492c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_24.txt @@ -0,0 +1,2 @@ +* +This model was contributed by jinho8345. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_25.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_25.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_26.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..e869735da6ea1ceddb22625579c1b447f4c4743c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_26.txt @@ -0,0 +1,3 @@ +Usage tips and examples + +[~transformers.BrosModel.forward] requires input_ids and bbox (bounding box). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_27.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..b13554ba9d59cba58f16a002f045f7af099eb7b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_27.txt @@ -0,0 +1 @@ +Each bounding box should be in (x0, y0, x1, y1) format (top-left corner, bottom-right corner). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_28.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9778b398ed952767a34e7f5b05b009f57274a098 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_28.txt @@ -0,0 +1 @@ +Obtaining bounding boxes depends on an external OCR system. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_29.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f5a969eb96b7b40282beb436fae41e3b01d78ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_29.txt @@ -0,0 +1 @@ +The x coordinate should be normalized by document image width, and the y coordinate should be normalized by document image height. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_30.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3c23457985333c463b35b281c3e7fda1215653d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_30.txt @@ -0,0 +1,8 @@ +thon +def expand_and_normalize_bbox(bboxes, doc_width, doc_height): + # here, bboxes are numpy array +# Normalize bbox -> 0 ~ 1 +bboxes[:, [0, 2]] = bboxes[:, [0, 2]] / doc_width +bboxes[:, [1, 3]] = bboxes[:, [1, 3]] / doc_height + +[~transformers.BrosForTokenClassification.forward, ~transformers.BrosSpadeEEForTokenClassification.forward, ~transformers.BrosSpadeELForTokenClassification.forward] require not only input_ids and bbox but also box_first_token_mask for loss calculation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_31.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..7005eb9e058aa1956aa8f78a26aaca105bbf88c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_31.txt @@ -0,0 +1 @@ +It is a mask to filter out non-first tokens of each box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_32.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5218eeb28a91fef85225eb50d80daed05304063 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_32.txt @@ -0,0 +1 @@ +You can obtain this mask by saving start token indices of bounding boxes when creating input_ids from words.
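As a concrete illustration of the bbox requirements above, the toy example below normalizes one made-up pixel box into the [0, 1] range; it simply inlines the scaling performed by the expand_and_normalize_bbox helper shown above, with invented values.

```python
# Toy illustration of the bbox normalization described above (all values are made up).
import numpy as np

bboxes = np.array([[10.0, 20.0, 110.0, 60.0]])  # one (x0, y0, x1, y1) box in pixels
doc_width, doc_height = 1000, 800               # document image size in pixels

bboxes[:, [0, 2]] = bboxes[:, [0, 2]] / doc_width   # x coordinates -> [0, 1]
bboxes[:, [1, 3]] = bboxes[:, [1, 3]] / doc_height  # y coordinates -> [0, 1]
print(bboxes)  # [[0.01  0.025 0.11  0.075]]
```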
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_33.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f327c4e3495fb2e3dc07f02d9849eab54f279a34 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_33.txt @@ -0,0 +1,28 @@ +You can make box_first_token_mask with following code, + +thon +def make_box_first_token_mask(bboxes, words, tokenizer, max_seq_length=512): +box_first_token_mask = np.zeros(max_seq_length, dtype=np.bool_) + +# encode(tokenize) each word from words (List[str]) +input_ids_list: List[List[int]] = [tokenizer.encode(e, add_special_tokens=False) for e in words] + +# get the length of each box +tokens_length_list: List[int] = [len(l) for l in input_ids_list] + +box_end_token_indices = np.array(list(itertools.accumulate(tokens_length_list))) +box_start_token_indices = box_end_token_indices - np.array(tokens_length_list) + +# filter out the indices that are out of max_seq_length +box_end_token_indices = box_end_token_indices[box_end_token_indices < max_seq_length - 1] +if len(box_start_token_indices) > len(box_end_token_indices): + box_start_token_indices = box_start_token_indices[: len(box_end_token_indices)] + +# set box_start_token_indices to True +box_first_token_mask[box_start_token_indices] = True + +return box_first_token_mask + +Resources + +Demo scripts can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_bros/chunk_34.txt b/chunked/content_aware_chunking/model_doc_bros/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec642b32f56d9ab7584ca20e17e072f1a492cf87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_bros/chunk_34.txt @@ -0,0 +1,17 @@ +BrosConfig +[[autodoc]] BrosConfig +BrosProcessor +[[autodoc]] BrosProcessor + - call +BrosModel +[[autodoc]] BrosModel + - forward +BrosForTokenClassification +[[autodoc]] BrosForTokenClassification + - forward +BrosSpadeEEForTokenClassification +[[autodoc]] BrosSpadeEEForTokenClassification + - forward +BrosSpadeELForTokenClassification +[[autodoc]] BrosSpadeELForTokenClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_10.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf67fffa4f5d9001fd37fd419894f4c6591262b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_10.txt @@ -0,0 +1,2 @@ +The original code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_11.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4d02d2c7421bfe4881470a9e96e05d4a8a114d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_11.txt @@ -0,0 +1 @@ +ByT5's architecture is based on the T5v1.1 model, refer to T5v1.1's documentation page for the API reference. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_12.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..40d8091120964118ac45fc1d812cac3b3b2a213a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_12.txt @@ -0,0 +1,2 @@ +They +only differ in how inputs should be prepared for the model, see the code examples below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_13.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2074e6d3c4c5f93c7307bc5dde15f0e4198d3c45 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_13.txt @@ -0,0 +1,2 @@ +Since ByT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task +fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_14.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02171176dfea8686a381c899b340100c72beb4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_14.txt @@ -0,0 +1 @@ +If you are doing multi-task fine-tuning, you should use a prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_15.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbd0d666a50035fba6f5e973df7643c8d05b89ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_15.txt @@ -0,0 +1,9 @@ +Usage example +ByT5 works on raw UTF-8 bytes, so it can be used without a tokenizer: +thon + +from transformers import T5ForConditionalGeneration +import torch +model = T5ForConditionalGeneration.from_pretrained("google/byt5-small") +num_special_tokens = 3 +Model has 3 special tokens which take up the input ids 0,1,2 of ByT5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_16.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d149a651591fd42781fa5343f7a42d1ff1e1e85a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_16.txt @@ -0,0 +1 @@ +=> Need to shift utf-8 character encodings by 3 before passing ids to model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_17.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..296eea2a50bbed61769a23245937e9fb84d2e41d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_17.txt @@ -0,0 +1 @@ +input_ids = torch.tensor([list("Life is like a box of chocolates. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_18.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bdf995184e18408ff0347eea26dc6c16bf518b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_18.txt @@ -0,0 +1,2 @@ +".encode("utf-8"))]) + num_special_tokens +labels = torch.tensor([list("La vie est comme une boîte de chocolat. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_19.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fabea083995188542ea1b1b038c87c105964ca34 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_19.txt @@ -0,0 +1,13 @@ +".encode("utf-8"))]) + num_special_tokens +loss = model(input_ids, labels=labels).loss +loss.item() +2.66 + +For batched inference and training it is however recommended to make use of the tokenizer: +thon + +from transformers import T5ForConditionalGeneration, AutoTokenizer +model = T5ForConditionalGeneration.from_pretrained("google/byt5-small") +tokenizer = AutoTokenizer.from_pretrained("google/byt5-small") +model_inputs = tokenizer( + ["Life is like a box of chocolates. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_20.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d66a6853ed00257de0490fd4e6b20ca59a343ddb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_20.txt @@ -0,0 +1 @@ +", "Today is Monday. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_21.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e63f2fa08584c259326188e329d4e59c145c944a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_21.txt @@ -0,0 +1,4 @@ +"], padding="longest", return_tensors="pt" + ) +labels_dict = tokenizer( + ["La vie est comme une boîte de chocolat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_22.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bbf9d1d71add3eef8e23061bc58c54cc6aa89bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_22.txt @@ -0,0 +1 @@ +", "Aujourd'hui c'est lundi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_23.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a3b7c3c26d76afeca83ad9e00c4ab466e19d01e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_23.txt @@ -0,0 +1,8 @@ +"], padding="longest", return_tensors="pt" + ) +labels = labels_dict.input_ids +loss = model(**model_inputs, labels=labels).loss +loss.item() +17.9 + +Similar to T5, ByT5 was trained on the span-mask denoising task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_24.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..01a0f37f488eca61650ce15ca9e62d883ff7c118 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_24.txt @@ -0,0 +1,3 @@ +However, +since the model works directly on characters, the pretraining task is a bit +different. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_25.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bd9a5b038589be144d8de8de098332075a682d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_25.txt @@ -0,0 +1,2 @@ +Let's corrupt some characters of the +input sentence "The dog chases a ball in the park." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_26.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b24adc7669d8befb1002c289a945da1e250b63f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_26.txt @@ -0,0 +1,2 @@ +and ask ByT5 to predict them +for us. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_27.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d19d7632070de6cf7f08fde5a2158aaccc934721 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_27.txt @@ -0,0 +1,7 @@ +thon + +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +import torch +tokenizer = AutoTokenizer.from_pretrained("google/byt5-base") +model = AutoModelForSeq2SeqLM.from_pretrained("google/byt5-base") +input_ids_prompt = "The dog chases a ball in the park." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_28.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..19b274bd22bb110f95c290dc8370ad296104e951 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_28.txt @@ -0,0 +1,6 @@ +input_ids = tokenizer(input_ids_prompt).input_ids +Note that we cannot add "{extra_id_}" to the string directly +as the Byte tokenizer would incorrectly merge the tokens +For ByT5, we need to work directly on the character level +Contrary to T5, ByT5 does not use sentinel tokens for masking, but instead +uses final utf character ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_29.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..80bb6dd6496e9f652d452180682a3c56ef0b1e44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_29.txt @@ -0,0 +1 @@ +UTF-8 is represented by 8 bits and ByT5 has 3 special tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_30.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dc3731b175d3bfeec4bede3477e83787c664952 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_30.txt @@ -0,0 +1 @@ +=> There are 2**8+2 = 259 input ids and mask tokens count down from index 258. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_31.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d62f9e98268bf09df0c0a8c0ae53850b4ed77ea2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_31.txt @@ -0,0 +1 @@ +=> mask to "The dog [258]a ball [257]park." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_32.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..52c90f5a963f7c3e48ec45051304ed14c64fb628 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_32.txt @@ -0,0 +1,4 @@ +input_ids = torch.tensor([input_ids[:8] + [258] + input_ids[14:21] + [257] + input_ids[28:]]) +input_ids +tensor([[ 87, 107, 104, 35, 103, 114, 106, 35, 258, 35, 100, 35, 101, 100, 111, 111, 257, 35, 115, 100, 117, 110, 49, 1]]) +ByT5 produces only one char at a time so we need to produce many more output characters here -> set max_length=100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_33.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..019712d18b38768deb98b1761eb64f55007ed6cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_33.txt @@ -0,0 +1,17 @@ +output_ids = model.generate(input_ids, max_length=100)[0].tolist() +output_ids +[0, 258, 108, 118, 35, 119, 107, 104, 35, 114, 113, 104, 35, 122, 107, 114, 35, 103, 114, 104, 118, 257, 35, 108, 113, 35, 119, 107, 104, 35, 103, 108, 118, 102, 114, 256, 108, 113, 35, 119, 107, 104, 35, 115, 100, 117, 110, 49, 35, 87, 107, 104, 35, 103, 114, 106, 35, 108, 118, 35, 119, 107, 104, 35, 114, 113, 104, 35, 122, 107, 114, 35, 103, 114, 104, 118, 35, 100, 35, 101, 100, 111, 111, 35, 108, 113, 255, 35, 108, 113, 35, 119, 107, 104, 35, 115, 100, 117, 110, 49] +^- Note how 258 descends to 257, 256, 255 +Now we need to split on the sentinel tokens, let's write a short loop for this +output_ids_list = [] +start_token = 0 +sentinel_token = 258 +while sentinel_token in output_ids: + split_idx = output_ids.index(sentinel_token) + output_ids_list.append(output_ids[start_token:split_idx]) + start_token = split_idx + sentinel_token -= 1 +output_ids_list.append(output_ids[start_token:]) +output_string = tokenizer.batch_decode(output_ids_list) +output_string +['', 'is the one who does', ' in the disco', 'in the park. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_34.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5f116bfbdebbd4942ff0f8d94c9adc36b96fd0d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_34.txt @@ -0,0 +1 @@ +The dog is the one who does a ball in', ' in the park.'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_byt5/chunk_35.txt b/chunked/content_aware_chunking/model_doc_byt5/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0631a1f11df5e93d74897acd4d6e95e9967094a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_byt5/chunk_35.txt @@ -0,0 +1,3 @@ +ByT5Tokenizer +[[autodoc]] ByT5Tokenizer +See [ByT5Tokenizer] for all details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..95fa4d23125ad791eece360b7306dcea3d5aef17 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by the ALMAnaCH team (Inria). 
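As a follow-up to the ByT5 sentinel-splitting loop above, the sketch below (an illustration, not part of the original docs) stitches the predicted spans back into the corrupted input; it assumes the `input_ids`, `output_ids_list`, and `tokenizer` variables from that example are still in scope.

```python
# Replace each sentinel id in the corrupted input with the span ByT5 predicted for it.
# output_ids_list[k] starts with its sentinel id, so we drop that first element;
# sentinel ids count down (258, 257, ...) just like in the splitting loop above.
filled_ids = []
span_idx = 1          # output_ids_list[0] holds tokens emitted before the first sentinel
sentinel_token = 258  # highest sentinel id, as in the masking step above
for token in input_ids[0].tolist():
    if token == sentinel_token:
        filled_ids.extend(output_ids_list[span_idx][1:])
        span_idx += 1
        sentinel_token -= 1
    else:
        filled_ids.append(token)
print(tokenizer.batch_decode([filled_ids])[0])
```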
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2152ba69372895fb5a3f3ffb1da38a3a4545792 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_12.txt @@ -0,0 +1 @@ +This implementation is the same as RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..24ac62f50474f57f8451730afd362ec763ffc345 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_13.txt @@ -0,0 +1,2 @@ +Refer to the documentation of RoBERTa for usage examples as well +as the information relative to the inputs and outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1f2048942e1acaf045b9a58b0613d6e4b918021 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_14.txt @@ -0,0 +1,49 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +CamembertConfig +[[autodoc]] CamembertConfig +CamembertTokenizer +[[autodoc]] CamembertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +CamembertTokenizerFast +[[autodoc]] CamembertTokenizerFast + +CamembertModel +[[autodoc]] CamembertModel +CamembertForCausalLM +[[autodoc]] CamembertForCausalLM +CamembertForMaskedLM +[[autodoc]] CamembertForMaskedLM +CamembertForSequenceClassification +[[autodoc]] CamembertForSequenceClassification +CamembertForMultipleChoice +[[autodoc]] CamembertForMultipleChoice +CamembertForTokenClassification +[[autodoc]] CamembertForTokenClassification +CamembertForQuestionAnswering +[[autodoc]] CamembertForQuestionAnswering + +TFCamembertModel +[[autodoc]] TFCamembertModel +TFCamembertForCausalLM +[[autodoc]] TFCamembertForCausalLM +TFCamembertForMaskedLM +[[autodoc]] TFCamembertForMaskedLM +TFCamembertForSequenceClassification +[[autodoc]] TFCamembertForSequenceClassification +TFCamembertForMultipleChoice +[[autodoc]] TFCamembertForMultipleChoice +TFCamembertForTokenClassification +[[autodoc]] TFCamembertForTokenClassification +TFCamembertForQuestionAnswering +[[autodoc]] TFCamembertForQuestionAnswering \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..015907b761575ad9650d81f8776997b0c872f684 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_5.txt @@ -0,0 +1,2 @@
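Since CamemBERT shares the RoBERTa interface listed above, a hedged fill-mask sketch follows; the `"almanach/camembert-base"` checkpoint name and the example sentence are assumptions for illustration, not taken from the chunks above.

```python
# Minimal sketch (assumed checkpoint: "almanach/camembert-base") using the
# masked-LM head via the generic fill-mask pipeline; CamemBERT uses "<mask>"
# as its mask token, like RoBERTa.
from transformers import pipeline

fill_mask = pipeline("fill-mask", model="almanach/camembert-base")
for prediction in fill_mask("Le camembert est <mask> :)")[:3]:
    print(prediction["token_str"], round(prediction["score"], 3))
```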
+This makes +practical use of such models --in all languages except English-- very limited. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddee9c664df1c1a21ce79a053d8b08190dc518eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_6.txt @@ -0,0 +1,2 @@ +Aiming to address this issue for French, +we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2a53426dcc5aee24488bd9ebc4b26a3d901cb85 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_7.txt @@ -0,0 +1,3 @@ +We measure the +performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging, +dependency parsing, named-entity recognition, and natural language inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..da3ecb9752fa22f811f375f61cfa437453a4ef2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_8.txt @@ -0,0 +1,2 @@ +CamemBERT improves the state of the art +for most of the tasks considered. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_camembert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_camembert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e879a38a31b48ae19054a28999c2041fef1f5b46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_camembert/chunk_9.txt @@ -0,0 +1,2 @@ +We release the pretrained model for CamemBERT hoping to foster research and +downstream applications for French NLP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_10.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_11.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..78dcffbfc4e492d12fb2c1433d6d50c4a1496235 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_11.txt @@ -0,0 +1,4 @@ +Usage tips + +CANINE uses no less than 3 Transformer encoders internally: 2 "shallow" encoders (which only consist of a single + layer) and 1 "deep" encoder (which is a regular BERT encoder). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_12.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ff7b1424bf272b6937d948ddc7d879a8086aab6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_12.txt @@ -0,0 +1,2 @@ +First, a "shallow" encoder is used to contextualize + the character embeddings, using local attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_13.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..91458632710991e1d07aa4e871cb67ebd9d7c5dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_13.txt @@ -0,0 +1 @@ +Next, after downsampling, a "deep" encoder is applied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_14.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed53b76333ee867bb0f64d93f6666ae98a253e43 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_14.txt @@ -0,0 +1,2 @@ +Finally, + after upsampling, a "shallow" encoder is used to create the final character embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_15.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef1647009f232a26d68b481d59de819ad038245a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_15.txt @@ -0,0 +1,2 @@ +Details regarding up- and + downsampling can be found in the paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_16.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c6ec4f1775c34f405b569d92cc9b7021b9ed9ec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_16.txt @@ -0,0 +1 @@ +CANINE uses a max sequence length of 2048 characters by default. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_17.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ee4f0b28459935511e147372a0dcec1c9f9c425 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_17.txt @@ -0,0 +1,2 @@ +One can use [CanineTokenizer] + to prepare text for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_18.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b19540813eeff1a6e24a553fae7f6f3ffbae12e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_18.txt @@ -0,0 +1,2 @@ +Classification can be done by placing a linear layer on top of the final hidden state of the special [CLS] token + (which has a predefined Unicode code point). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_19.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cf013f2b248ac77e178e78b0e0fe47b661f4a03 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_19.txt @@ -0,0 +1,2 @@ +For token classification tasks however, the downsampled sequence of + tokens needs to be upsampled again to match the length of the original character sequence (which is 2048). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_20.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..6519c3d9759d359c81add22514c2dabc6987d615 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_20.txt @@ -0,0 +1,2 @@ +The + details for this can be found in the paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_21.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1dbc5be3bf2e109364645248efa8458ea1e8dea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_21.txt @@ -0,0 +1,4 @@ +Model checkpoints: + +google/canine-c: Pre-trained with autoregressive character loss, + 12-layer, 768-hidden, 12-heads, 121M parameters (size ~500 MB). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_22.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ee2b738522fd07050198dbb9d338a8d99450bd5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_22.txt @@ -0,0 +1,2 @@ +google/canine-s: Pre-trained with subword loss, 12-layer, + 768-hidden, 12-heads, 121M parameters (size ~500 MB). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_23.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9736d46382ff3bd8af069f451840ceddb52ed3f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_23.txt @@ -0,0 +1,22 @@ +Usage example +CANINE works on raw characters, so it can be used without a tokenizer: +thon + +from transformers import CanineModel +import torch +model = CanineModel.from_pretrained("google/canine-c") # model pre-trained with autoregressive character loss +text = "hello world" +use Python's built-in ord() function to turn each character into its unicode code point id +input_ids = torch.tensor([[ord(char) for char in text]]) +outputs = model(input_ids) # forward pass +pooled_output = outputs.pooler_output +sequence_output = outputs.last_hidden_state + +For batched inference and training, it is however recommended to make use of the tokenizer (to pad/truncate all +sequences to the same length): +thon + +from transformers import CanineTokenizer, CanineModel +model = CanineModel.from_pretrained("google/canine-c") +tokenizer = CanineTokenizer.from_pretrained("google/canine-c") +inputs = ["Life is like a box of chocolates. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_24.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ded563dd741bd65f3b5c6aa183c99a980a66090 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_24.txt @@ -0,0 +1 @@ +", "You never know what you gonna get."] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_25.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f79f1d6bf45c111803fd28c3b39d7a5aa620c5f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_25.txt @@ -0,0 +1,36 @@ +encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt") +outputs = model(**encoding) # forward pass +pooled_output = outputs.pooler_output +sequence_output = outputs.last_hidden_state + +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Multiple choice task guide + +CanineConfig +[[autodoc]] CanineConfig +CanineTokenizer +[[autodoc]] CanineTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences +CANINE specific outputs +[[autodoc]] models.canine.modeling_canine.CanineModelOutputWithPooling +CanineModel +[[autodoc]] CanineModel + - forward +CanineForSequenceClassification +[[autodoc]] CanineForSequenceClassification + - forward +CanineForMultipleChoice +[[autodoc]] CanineForMultipleChoice + - forward +CanineForTokenClassification +[[autodoc]] CanineForTokenClassification + - forward +CanineForQuestionAnswering +[[autodoc]] CanineForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_8.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0723976393074ba44a234b13926c8661d32c1a74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_8.txt @@ -0,0 +1,2 @@ +CANINE outperforms a comparable mBERT model by +2.8 F1 on TyDi QA, a challenging multilingual benchmark, despite having 28% fewer model parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_canine/chunk_9.txt b/chunked/content_aware_chunking/model_doc_canine/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_canine/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b86aa71ceafaa397d06159a90794fe1107e08cb3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_10.txt @@ -0,0 +1 @@ +The Chinese-CLIP model was contributed by OFA-Sys. 
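Building on the CANINE usage example above, the sketch below shows how the pooled [CLS] representation feeds a classification head through `CanineForSequenceClassification`; the head is randomly initialized here (the `num_labels=2` choice is an arbitrary assumption), so the logits are meaningless until fine-tuning.

```python
# Hedged sketch: sequence classification on top of CANINE's pooled [CLS] output.
# Assumptions: the "google/canine-s" checkpoint and an arbitrary 2-label head.
import torch
from transformers import CanineTokenizer, CanineForSequenceClassification

tokenizer = CanineTokenizer.from_pretrained("google/canine-s")
model = CanineForSequenceClassification.from_pretrained("google/canine-s", num_labels=2)

inputs = tokenizer(["hello world", "bonjour le monde"], padding="longest", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape: (batch_size, num_labels)
print(logits.shape)
```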
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..a83efb23f48c3a3d8ccf9405ac913e4762938b86 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_11.txt @@ -0,0 +1,59 @@ +Usage example +The code snippet below shows how to compute image & text features and similarities: +thon + +from PIL import Image +import requests +from transformers import ChineseCLIPProcessor, ChineseCLIPModel +model = ChineseCLIPModel.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") +processor = ChineseCLIPProcessor.from_pretrained("OFA-Sys/chinese-clip-vit-base-patch16") +url = "https://clip-cn-beijing.oss-cn-beijing.aliyuncs.com/pokemon.jpeg" +image = Image.open(requests.get(url, stream=True).raw) +Squirtle, Bulbasaur, Charmander, Pikachu in English +texts = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"] +compute image feature +inputs = processor(images=image, return_tensors="pt") +image_features = model.get_image_features(**inputs) +image_features = image_features / image_features.norm(p=2, dim=-1, keepdim=True) # normalize +compute text features +inputs = processor(text=texts, padding=True, return_tensors="pt") +text_features = model.get_text_features(**inputs) +text_features = text_features / text_features.norm(p=2, dim=-1, keepdim=True) # normalize +compute image-text similarity scores +inputs = processor(text=texts, images=image, return_tensors="pt", padding=True) +outputs = model(**inputs) +logits_per_image = outputs.logits_per_image # this is the image-text similarity score +probs = logits_per_image.softmax(dim=1) # probs: [[1.2686e-03, 5.4499e-02, 6.7968e-04, 9.4355e-01]] + +Currently, the following scales of pretrained Chinese-CLIP models are available on 🤗 Hub: + +OFA-Sys/chinese-clip-vit-base-patch16 +OFA-Sys/chinese-clip-vit-large-patch14 +OFA-Sys/chinese-clip-vit-large-patch14-336px +OFA-Sys/chinese-clip-vit-huge-patch14 + +ChineseCLIPConfig +[[autodoc]] ChineseCLIPConfig + - from_text_vision_configs +ChineseCLIPTextConfig +[[autodoc]] ChineseCLIPTextConfig +ChineseCLIPVisionConfig +[[autodoc]] ChineseCLIPVisionConfig +ChineseCLIPImageProcessor +[[autodoc]] ChineseCLIPImageProcessor + - preprocess +ChineseCLIPFeatureExtractor +[[autodoc]] ChineseCLIPFeatureExtractor +ChineseCLIPProcessor +[[autodoc]] ChineseCLIPProcessor +ChineseCLIPModel +[[autodoc]] ChineseCLIPModel + - forward + - get_text_features + - get_image_features +ChineseCLIPTextModel +[[autodoc]] ChineseCLIPTextModel + - forward +ChineseCLIPVisionModel +[[autodoc]] ChineseCLIPVisionModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_5.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4986cfae2eb6577e536531c015a8bd38652df66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_5.txt @@ -0,0 +1 @@ +In this work, we construct a large-scale dataset of image-text pairs in Chinese, where most data are retrieved from publicly available datasets, and we pretrain Chinese CLIP models on the new dataset.
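As a small usage note for the Chinese-CLIP snippet above, the zero-shot label is simply the argmax of `probs`; the sketch below reuses the variables from that example and assumes `texts` and `probs` are still in scope.

```python
# Pick the best-matching Chinese caption for the image from the probabilities
# computed above (probs has shape (num_images, num_texts)).
best = probs.argmax(dim=1).item()
print(texts[best])  # expected to be the Pikachu caption for the example image
```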
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_6.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..a767c795fb0d57ead5f7ea8865428fff86c3fe2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_6.txt @@ -0,0 +1 @@ +We develop 5 Chinese CLIP models of multiple sizes, spanning from 77 to 958 million parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_7.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b249ec8c9aa157997b6e45a15d2dc9e1d8cbed4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_7.txt @@ -0,0 +1 @@ +Furthermore, we propose a two-stage pretraining method, where the model is first trained with the image encoder frozen and then trained with all parameters being optimized, to achieve enhanced model performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa346854c0737a297b7292c1018350c5cec7fb5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_8.txt @@ -0,0 +1 @@ +Our comprehensive experiments demonstrate that Chinese CLIP can achieve the state-of-the-art performance on MUGE, Flickr30K-CN, and COCO-CN in the setups of zero-shot learning and finetuning, and it is able to achieve competitive performance in zero-shot image classification based on the evaluation on the ELEVATER benchmark (Li et al., 2022). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd15cace2404f67c3c6db8f5b774dc644a59a061 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_chinese_clip/chunk_9.txt @@ -0,0 +1 @@ +Our codes, pretrained models, and demos have been released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_10.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd627f7e34b4e0afb2244804617a3784ed2d308e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_10.txt @@ -0,0 +1 @@ +We incorporate the feature fusion mechanism and keyword-to-caption augmentation into the model design to further enable the model to process audio inputs of variable lengths and enhance the performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_11.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ff1dd91ac16c7b342269819aceab7205ff66879 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_11.txt @@ -0,0 +1 @@ +Third, we perform comprehensive experiments to evaluate our model across three tasks: text-to-audio retrieval, zero-shot audio classification, and supervised audio classification. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_12.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3656ac4ad3db6ec422e8aa3e7e4cd50dde6766b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_12.txt @@ -0,0 +1 @@ +The results demonstrate that our model achieves superior performance in text-to-audio retrieval task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_13.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ea396c10cc1aea42d25ed3359602354a190e96c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_13.txt @@ -0,0 +1 @@ +In audio classification tasks, the model achieves state-of-the-art performance in the zeroshot setting and is able to obtain performance comparable to models' results in the non-zero-shot setting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_14.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..14fe58c07145810719da8605a0c9b82d59e70282 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_14.txt @@ -0,0 +1,2 @@ +LAION-Audio-6 +This model was contributed by Younes Belkada and Arthur Zucker . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_15.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_16.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..13b4bd446cef13414f716b2fece4a8edff7f6859 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_16.txt @@ -0,0 +1,28 @@ +ClapConfig +[[autodoc]] ClapConfig + - from_text_audio_configs +ClapTextConfig +[[autodoc]] ClapTextConfig +ClapAudioConfig +[[autodoc]] ClapAudioConfig +ClapFeatureExtractor +[[autodoc]] ClapFeatureExtractor +ClapProcessor +[[autodoc]] ClapProcessor +ClapModel +[[autodoc]] ClapModel + - forward + - get_text_features + - get_audio_features +ClapTextModel +[[autodoc]] ClapTextModel + - forward +ClapTextModelWithProjection +[[autodoc]] ClapTextModelWithProjection + - forward +ClapAudioModel +[[autodoc]] ClapAudioModel + - forward +ClapAudioModelWithProjection +[[autodoc]] ClapAudioModelWithProjection + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_6.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9ebe55fa57ef671094293481f95081702cbdfab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_6.txt @@ -0,0 +1,2 @@ +The abstract from the paper is the following: +Contrastive learning has shown remarkable success in the field of multimodal representation learning. 
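The CLAP class listing above mirrors the CLIP-style dual-encoder API for audio; below is a hedged sketch of text-audio similarity scoring. The `"laion/clap-htsat-unfused"` checkpoint name, the 48 kHz sampling rate, and the dummy audio clip are assumptions for illustration.

```python
# Hedged sketch: score a (dummy) audio clip against two text prompts with CLAP.
# Assumptions: checkpoint "laion/clap-htsat-unfused" and 48 kHz input audio.
import numpy as np
import torch
from transformers import ClapModel, ClapProcessor

model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
processor = ClapProcessor.from_pretrained("laion/clap-htsat-unfused")

audio = np.zeros(48_000, dtype=np.float32)  # 1 second of silence as a placeholder
texts = ["the sound of a dog barking", "silence"]

inputs = processor(text=texts, audios=[audio], sampling_rate=48_000,
                   return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.logits_per_audio.softmax(dim=-1))  # audio-to-text similarity
```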
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_7.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6102045c8f0c7b5427078577c60f9b0b1ba47f4d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_7.txt @@ -0,0 +1 @@ +In this paper, we propose a pipeline of contrastive language-audio pretraining to develop an audio representation by combining audio data with natural language descriptions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_8.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed72a3f37fce157fee0ed0cd42633eb6c0a0d96b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_8.txt @@ -0,0 +1 @@ +To accomplish this target, we first release LAION-Audio-630K, a large collection of 633,526 audio-text pairs from different data sources. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clap/chunk_9.txt b/chunked/content_aware_chunking/model_doc_clap/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..48224305a31f42d7852c8ae39b2863c72761daed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clap/chunk_9.txt @@ -0,0 +1 @@ +Second, we construct a contrastive language-audio pretraining model by considering different audio encoders and text encoders. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..971dd2f284db0493d56c596327031dff7a40146f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_11.txt @@ -0,0 +1,2 @@ +We release our code and pre-trained +model weights at this https URL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_12.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1bd8890a67017e8946e3bb4f5ce5f1b685e242a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by valhalla. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_13.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_14.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f510c8917e6831f28c728bbbecb16065bc9c8fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_14.txt @@ -0,0 +1,2 @@ +Usage tips and example +CLIP is a multi-modal vision and language model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_15.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e9a2e30384e44dcd5419ed8b51cf3b0efd662a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_15.txt @@ -0,0 +1,2 @@ +It can be used for image-text similarity and for zero-shot image +classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_16.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..376d2ecfb91885403f8fc97d58a25a414bedb338 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_16.txt @@ -0,0 +1,2 @@ +CLIP uses a ViT-like transformer to get visual features and a causal language model to get the text +features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_17.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab29347a1e9ff8b418a939d0435bc5920189e304 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_17.txt @@ -0,0 +1 @@ +Both the text and visual features are then projected to a latent space with identical dimension. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_18.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9d81c7ab2dc4f86d3742a8a4a647b9d5408ebbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_18.txt @@ -0,0 +1,2 @@ +The dot +product between the projected image and text features is then used as a similarity score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_19.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f9e2d777bd91819099de23f51c6c7cbfbc094ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_19.txt @@ -0,0 +1,2 @@ +To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, +which are then linearly embedded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_20.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f68ae4c009338b46f96259412267959ff4e93781 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_20.txt @@ -0,0 +1 @@ +A [CLS] token is added to serve as representation of an entire image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_21.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdbaa620c3c182949ec7608b509ebc76149bb765 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_21.txt @@ -0,0 +1,2 @@ +The authors +also add absolute position embeddings, and feed the resulting sequence of vectors to a standard Transformer encoder.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_22.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f7be9a6ebac782160b88b44b40139a1ce54d0d7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_22.txt @@ -0,0 +1 @@ +The [CLIPImageProcessor] can be used to resize (or rescale) and normalize images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_23.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ba208117e508ca7ce196a02f5c0b57a13c748f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_23.txt @@ -0,0 +1 @@ +The [CLIPTokenizer] is used to encode the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_24.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8615e335cc3366b99a042254481a5bcf09fae490 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_24.txt @@ -0,0 +1,3 @@ +The [CLIPProcessor] wraps +[CLIPImageProcessor] and [CLIPTokenizer] into a single instance to both +encode the text and prepare the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_25.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..545b8546f57dec0498ac76a3429ebe803ee86cdd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_25.txt @@ -0,0 +1,2 @@ +The following example shows how to get the image-text similarity scores using +[CLIPProcessor] and [CLIPModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_26.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e5fd83638457db79c27ab038580d21a361055d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_26.txt @@ -0,0 +1,16 @@ +thon + +from PIL import Image +import requests +from transformers import CLIPProcessor, CLIPModel +model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32") +processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +inputs = processor(text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True) +outputs = model(**inputs) +logits_per_image = outputs.logits_per_image # this is the image-text similarity score +probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIP. 
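As a complement to the processor/model snippet above, the same zero-shot idea is also exposed through the high-level pipeline API; the sketch below is a hedged alternative reusing the same checkpoint and labels.

```python
# Hedged sketch: zero-shot image classification with the pipeline API,
# reusing the openai/clip-vit-base-patch32 checkpoint from the example above.
from transformers import pipeline

classifier = pipeline("zero-shot-image-classification", model="openai/clip-vit-base-patch32")
results = classifier(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["a photo of a cat", "a photo of a dog"],
)
print(results[0])  # highest-scoring label first
```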
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_27.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..22b8258f470499eec51bd18b989c06bbe04cb258 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_27.txt @@ -0,0 +1 @@ +Fine tuning CLIP with Remote Sensing (Satellite) images and captions, a blog post about how to fine-tune CLIP with RSICD dataset and comparison of performance changes due to data augmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_28.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..335ae8b4d976b6a86f1784f2e1577137ee369d8d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_28.txt @@ -0,0 +1 @@ +This example script shows how to train a CLIP-like vision-text dual encoder model using a pre-trained vision and text encoder using COCO dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_29.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c727ff85fd1e9c6554fad9bcc3c6d8327ed48e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_29.txt @@ -0,0 +1 @@ +A notebook on how to use a pretrained CLIP for inference with beam search for image captioning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_30.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c3c21dbe9349de4e3963446454accc743c26f4a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_30.txt @@ -0,0 +1,5 @@ +🌎 + +Image retrieval + +A notebook on image retrieval using pretrained CLIP and computing MRR(Mean Reciprocal Rank) score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_31.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b34a6346519e0f946848caf6ac310b4754db8b59 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_31.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on image retrieval and showing the similarity score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_32.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cdd0131cfec45f52e3bcfcdcaad87460660f85f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_32.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to map images and texts to the same vector space using Multilingual CLIP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_33.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..fac3e7e9fdcc561e257bff757b27ce50c3227976 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_33.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to run CLIP on semantic image search using Unsplash and TMDB datasets. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_34.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..225c9c1d48d2940682070fcf3db6a62a1d208ea4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_34.txt @@ -0,0 +1,5 @@ +🌎 + +Explainability + +A notebook on how to visualize similarity between input token and image segment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_35.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea39e40ea6564659ab00a30d7c1c9438bb949b75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_35.txt @@ -0,0 +1,3 @@ +🌎 + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_36.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_36.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clip/chunk_37.txt b/chunked/content_aware_chunking/model_doc_clip/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..488a32854275fc66295f7cf50f3e42fc062d5acf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clip/chunk_37.txt @@ -0,0 +1,70 @@ +CLIPConfig +[[autodoc]] CLIPConfig + - from_text_vision_configs +CLIPTextConfig +[[autodoc]] CLIPTextConfig +CLIPVisionConfig +[[autodoc]] CLIPVisionConfig +CLIPTokenizer +[[autodoc]] CLIPTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +CLIPTokenizerFast +[[autodoc]] CLIPTokenizerFast +CLIPImageProcessor +[[autodoc]] CLIPImageProcessor + - preprocess +CLIPFeatureExtractor +[[autodoc]] CLIPFeatureExtractor +CLIPProcessor +[[autodoc]] CLIPProcessor + +CLIPModel +[[autodoc]] CLIPModel + - forward + - get_text_features + - get_image_features +CLIPTextModel +[[autodoc]] CLIPTextModel + - forward +CLIPTextModelWithProjection +[[autodoc]] CLIPTextModelWithProjection + - forward +CLIPVisionModelWithProjection +[[autodoc]] CLIPVisionModelWithProjection + - forward +CLIPVisionModel +[[autodoc]] CLIPVisionModel + - forward +CLIPForImageClassification +[[autodoc]] CLIPForImageClassification + - forward + +TFCLIPModel +[[autodoc]] TFCLIPModel + - call + - get_text_features + - get_image_features +TFCLIPTextModel +[[autodoc]] TFCLIPTextModel + - call +TFCLIPVisionModel +[[autodoc]] TFCLIPVisionModel + - call + +FlaxCLIPModel +[[autodoc]] FlaxCLIPModel + - call + - get_text_features + - get_image_features +FlaxCLIPTextModel +[[autodoc]] FlaxCLIPTextModel + - call +FlaxCLIPTextModelWithProjection +[[autodoc]] FlaxCLIPTextModelWithProjection + - call +FlaxCLIPVisionModel +[[autodoc]] FlaxCLIPVisionModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_10.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_10.txt new file mode 100644 index 
0000000000000000000000000000000000000000..28c32d4e081c030008453da230799af490c8057c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_10.txt @@ -0,0 +1,4 @@ +This novel hybrid input allows for dynamic adaptation not +only to the three segmentation tasks mentioned above, but +to any binary segmentation task where a text or image query +can be formulated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_11.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..26282283cb6b119be59a9207f5fab62519746b49 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_11.txt @@ -0,0 +1,4 @@ +Finally, we find our system to adapt well +to generalized queries involving affordances or properties + + CLIPSeg overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_12.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_13.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_14.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_15.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..8409180f52e34bdd9bfc1897d83961faa32c3676 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +[CLIPSegForImageSegmentation] adds a decoder on top of [CLIPSegModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_16.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce66cab4e352343ceb2117ad82eee2074cdc2d04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_16.txt @@ -0,0 +1 @@ +The latter is identical to [CLIPModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_17.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..91c316b17b8e92392d0d455037d88a656a8ac1a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_17.txt @@ -0,0 +1 @@ +[CLIPSegForImageSegmentation] can generate image segmentations based on arbitrary prompts at test time. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_18.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfc8413afd5e1c3900f5f5af0f70ec102f30629c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_18.txt @@ -0,0 +1,2 @@ +A prompt can be either a text +(provided to the model as input_ids) or an image (provided to the model as conditional_pixel_values). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_19.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..f10718a35ebf23dee8aada0475ccca68585fe9cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_19.txt @@ -0,0 +1,2 @@ +One can also provide custom +conditional embeddings (provided to the model as conditional_embeddings). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_20.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ad016df42b3da81c29965659db770ae2a905f95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_20.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CLIPSeg. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_21.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_22.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_23.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..31df212343b2583428cd0dfd24788fa22b7b8224 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_23.txt @@ -0,0 +1 @@ +A notebook that illustrates zero-shot image segmentation with CLIPSeg. 
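Here is a hedged, minimal version of what such a zero-shot segmentation notebook does with the CLIPSeg classes; the `"CIDAS/clipseg-rd64-refined"` checkpoint and the prompt strings are assumptions for illustration.

```python
# Hedged sketch: text-prompted binary segmentation with CLIPSeg.
# Assumptions: checkpoint "CIDAS/clipseg-rd64-refined" and two example prompts.
import requests
import torch
from PIL import Image
from transformers import CLIPSegProcessor, CLIPSegForImageSegmentation

processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
prompts = ["a cat", "a remote control"]

# One (image, prompt) pair per prompt; the model returns a low-resolution logit map per pair.
inputs = processor(text=prompts, images=[image] * len(prompts), padding=True, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
masks = torch.sigmoid(logits)  # per-prompt soft masks in [0, 1]
print(masks.shape)
```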
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_24.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b110877b0c174445c15cef1615a832543b64b4ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_24.txt @@ -0,0 +1,23 @@ +CLIPSegConfig +[[autodoc]] CLIPSegConfig + - from_text_vision_configs +CLIPSegTextConfig +[[autodoc]] CLIPSegTextConfig +CLIPSegVisionConfig +[[autodoc]] CLIPSegVisionConfig +CLIPSegProcessor +[[autodoc]] CLIPSegProcessor +CLIPSegModel +[[autodoc]] CLIPSegModel + - forward + - get_text_features + - get_image_features +CLIPSegTextModel +[[autodoc]] CLIPSegTextModel + - forward +CLIPSegVisionModel +[[autodoc]] CLIPSegVisionModel + - forward +CLIPSegForImageSegmentation +[[autodoc]] CLIPSegForImageSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_7.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2cdef1e2faa503d7e5be6cdc5ac16a000920846 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_7.txt @@ -0,0 +1,2 @@ +We build upon the CLIP model as a backbone which we extend with a transformer-based decoder that enables dense +prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_8.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..eab648740a74987c0861c8ee9aed901f5edb188c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_8.txt @@ -0,0 +1,3 @@ +After training on an extended version of the +PhraseCut dataset, our system generates a binary segmentation map for an image based on a free-text prompt or on +an additional image expressing the query. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clipseg/chunk_9.txt b/chunked/content_aware_chunking/model_doc_clipseg/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..93a4167b6ee7b2286d9c4d0e2dc308753b01eaaa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clipseg/chunk_9.txt @@ -0,0 +1 @@ +We analyze different variants of the latter image-based prompts in detail. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_10.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..692bd6c1ec0f3187270640630ec588593bd70c36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_10.txt @@ -0,0 +1 @@ +The use of the [ClvpModelForConditionalGeneration.generate()] method is strongly recommended for tortoise usage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_11.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c3b06b29f8f14f81944dad9b3463ca42c3dcaa3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_11.txt @@ -0,0 +1 @@ +Note that the CLVP model expects the audio to be sampled at 22.05 kHz contrary to other audio models which expects 16 kHz. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_12.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5a5237d2979052e76e025fb8dd408cc9b5cd92c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_12.txt @@ -0,0 +1,3 @@ +Brief Explanation: + +The [ClvpTokenizer] tokenizes the text input, and the [ClvpFeatureExtractor] extracts the log mel-spectrogram from the desired audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_13.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b20aa3ee6641162d1bfac482ad0a601e8ce57849 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_13.txt @@ -0,0 +1 @@ +[ClvpConditioningEncoder] takes those text tokens and audio representations and converts them into embeddings conditioned on the text and audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_14.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a321518e16823de38563ad9f8045cd6fa4c55361 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_14.txt @@ -0,0 +1 @@ +The [ClvpForCausalLM] uses those embeddings to generate multiple speech candidates. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_15.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2f05a335a45eb33cfb26fe37a7fb7899bd6d4ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_15.txt @@ -0,0 +1 @@ +Each speech candidate is passed through the speech encoder ([ClvpEncoder]) which converts them into a vector representation, and the text encoder ([ClvpEncoder]) converts the text tokens into the same latent space. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_16.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b1fd4b78be058b1d8e88f5e71f5626cbba371c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_16.txt @@ -0,0 +1 @@ +At the end, we compare each speech vector with the text vector to see which speech vector is most similar to the text vector. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_17.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..cec351e0ab03db92c82adfb8b61a84f68d1e3e75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_17.txt @@ -0,0 +1 @@ +[ClvpModelForConditionalGeneration.generate()] compresses all of the logic described above into a single method. 
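To make the "compare each speech vector with the text vector" step above concrete, here is a hedged sketch of the text-embedding path; it assumes the `"susnato/clvp_dev"` checkpoint used elsewhere in these chunks and that `get_text_features` accepts tokenized text the same way the forward pass does.

```python
# Hedged sketch: project text into CLVP's shared latent space via the text encoder.
# Assumptions: checkpoint "susnato/clvp_dev"; get_text_features takes input_ids/attention_mask.
import torch
from transformers import ClvpProcessor, ClvpModelForConditionalGeneration

processor = ClvpProcessor.from_pretrained("susnato/clvp_dev")
model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev")

tokens = processor(text="This is an example text.", return_tensors="pt")
with torch.no_grad():
    text_embeds = model.get_text_features(
        input_ids=tokens["input_ids"], attention_mask=tokens["attention_mask"]
    )
print(text_embeds.shape)  # one embedding vector per input text
```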
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_18.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfd382d003a38bb18ba81f015776f47c04558165 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_18.txt @@ -0,0 +1,6 @@ +Example: +python + +import datasets +from transformers import ClvpProcessor, ClvpModelForConditionalGeneration +Define the Text and Load the Audio (We are taking an audio example from HuggingFace Hub using the datasets library). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_19.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..68966372e29c320c10fab899f8967bf7daf622ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_19.txt @@ -0,0 +1 @@ +text = "This is an example text." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_20.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bd2b2671cc6c62d480929714028d60ff21f4282 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_20.txt @@ -0,0 +1,4 @@ +ds = datasets.load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +ds = ds.cast_column("audio", datasets.Audio(sampling_rate=22050)) +sample = ds[0]["audio"] +Define processor and model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_21.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9047b9826141c756d81755529e80343b9ef371d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_21.txt @@ -0,0 +1,3 @@ +processor = ClvpProcessor.from_pretrained("susnato/clvp_dev") +model = ClvpModelForConditionalGeneration.from_pretrained("susnato/clvp_dev") +Generate processor output and model output.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_22.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..beb14f8b55c3858cf6fb56bb870dfb0a99eaf0cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_22.txt @@ -0,0 +1,35 @@ +processor_output = processor(raw_speech=sample["array"], sampling_rate=sample["sampling_rate"], text=text, return_tensors="pt") +generated_output = model.generate(**processor_output) + +ClvpConfig +[[autodoc]] ClvpConfig + - from_sub_model_configs +ClvpEncoderConfig +[[autodoc]] ClvpEncoderConfig +ClvpDecoderConfig +[[autodoc]] ClvpDecoderConfig +ClvpTokenizer +[[autodoc]] ClvpTokenizer + - save_vocabulary +ClvpFeatureExtractor +[[autodoc]] ClvpFeatureExtractor + - call +ClvpProcessor +[[autodoc]] ClvpProcessor + - call + - decode + - batch_decode +ClvpModelForConditionalGeneration +[[autodoc]] ClvpModelForConditionalGeneration + - forward + - generate + - get_text_features + - get_speech_features +ClvpForCausalLM +[[autodoc]] ClvpForCausalLM +ClvpModel +[[autodoc]] ClvpModel +ClvpEncoder +[[autodoc]] ClvpEncoder +ClvpDecoder +[[autodoc]] ClvpDecoder \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_6.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..da25758841c5a2df6576b576e5390e7ad79a3e28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by Susnato Dhar. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_7.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_7.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_8.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..00f7729beca732af1a81f0e60d215f837ebd10ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +CLVP is an integral part of the Tortoise TTS model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_clvp/chunk_9.txt b/chunked/content_aware_chunking/model_doc_clvp/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c70e39ac279d6e926ab1fe6ec83f56bd6876d6e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_clvp/chunk_9.txt @@ -0,0 +1 @@ +CLVP can be used to compare different generated speech candidates with the provided text, and the best speech tokens are forwarded to the diffusion model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_13.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b620bacfc8e846ad0ac5cf6e7129d7afed4c62cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_13.txt @@ -0,0 +1 @@ +This will be picked by default. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_14.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd9ab7f6201f77cb26a41d75587aaaed095fa98e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_14.txt @@ -0,0 +1 @@ +If you want the AutoModel API to load the checkpoints with the storage weights dtype, you must specify torch_dtype="auto", e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_15.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..eacd4feb8820c73454ea8ab2c52bac1b99fce27f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_15.txt @@ -0,0 +1 @@ +model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto"). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_16.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e78df6a34bb63c9bf9c9a5d41f7a313fba57ee45 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_16.txt @@ -0,0 +1 @@ +bfloat16: Code Llama was trained with this precision, so we recommend using it for further training or fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_17.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..90338d55d2e1d36aefeca6d64a4300d7a5bac9b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_17.txt @@ -0,0 +1 @@ +float16: We recommend running inference using this precision, as it's usually faster than bfloat16, and evaluation metrics show no discernible degradation with respect to bfloat16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_18.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..97f37ba5e52f58a34fbd052a8a3e0c8a500c878f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_18.txt @@ -0,0 +1 @@ +You can also run inference using bfloat16, and we recommend you check inference results with both float16 and bfloat16 after fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_19.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..46582aa154d5580235322f53588b3986c82233f7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_19.txt @@ -0,0 +1 @@ +As mentioned above, the dtype of the storage weights is mostly irrelevant unless you are using torch_dtype="auto" when initializing a model.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_20.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d585e51f4e3277b8d7c20c25d5efb26fa049e329 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_20.txt @@ -0,0 +1 @@ +The reason is that the model will first be downloaded (using the dtype of the checkpoints online) and then will be casted to the default dtype of torch (becomes torch.float32). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_21.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..855f25a97effc5051aaafadb9dc8dcfc9af1aa35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_21.txt @@ -0,0 +1 @@ +If there is a specified torch_dtype, it will be used instead. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_22.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..815f7d3bc588692570d5f82e0f330baa376c8156 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_22.txt @@ -0,0 +1,2 @@ +Tips: +- The infilling task is supported out of the box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_23.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9a1e9cb2216bc7e4c896114d1d0bcd3129684ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_23.txt @@ -0,0 +1 @@ +You should be using the tokenizer.fill_token where you want your input to be filled. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_24.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8948cb293e296652b047b54a208cae3a4bba9168 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_24.txt @@ -0,0 +1,7 @@ +- The model conversion script is the same as for the Llama2 family: +Here is a sample usage: + +python src/transformers/models/llama/convert_llama_weights_to_hf.py \ + --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path +Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_25.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..968941b30d5f0bd3aec11e06b1c6447e034a4d62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_25.txt @@ -0,0 +1,16 @@ +After conversion, the model and tokenizer can be loaded via: +python + +from transformers import LlamaForCausalLM, CodeLlamaTokenizer +tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf") +model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf") +PROMPT = '''def remove_non_ascii(s: str) -> str: + """ <FILL_ME> + return result +''' +input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"] +generated_ids = model.generate(input_ids, max_new_tokens=128) +filling = tokenizer.batch_decode(generated_ids[:, input_ids.shape[1]:], skip_special_tokens = True)[0] +print(PROMPT.replace("<FILL_ME>", filling)) +def remove_non_ascii(s: str) -> str: + """ Remove non-ASCII characters from a string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_26.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac3652b414c583de289688ddfdec982e626d2772 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_26.txt @@ -0,0 +1,2 @@ +Args: + s: The string to remove non-ASCII characters from. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_27.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a4f7be0fb5a85e0678750f3d3b042124da384ec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_27.txt @@ -0,0 +1,3 @@ +Returns: + The string with non-ASCII characters removed. +""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_28.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..371872a3a163a8bffa8ca184ea186d43ead76fc2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_28.txt @@ -0,0 +1,15 @@ +result = "" +for c in s: + if ord(c) < 128: + result += c +return result + +If you only want the infilled part: +python + +from transformers import pipeline +import torch +generator = pipeline("text-generation",model="codellama/CodeLlama-7b-hf",torch_dtype=torch.float16, device_map="auto") +generator('def remove_non_ascii(s: str) -> str:\n """ <FILL_ME>\n return result', max_new_tokens = 128, return_type = 1) + +Under the hood, the tokenizer automatically splits by <FILL_ME> to create a formatted input string that follows the original training pattern. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_29.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..27cf8ff3be3b1a4916af6e15a7e40b0dbfc425e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_29.txt @@ -0,0 +1 @@ +This is more robust than preparing the pattern yourself: it avoids pitfalls, such as token glueing, that are very hard to debug.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_30.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..837d203a1a449123443341d9807f984099a2f11a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_30.txt @@ -0,0 +1 @@ +To see how much CPU and GPU memory you need for this model or others, try this calculator which can help determine that value. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_31.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecebfbb96862a6f3c5f508f4de2fe53d06f7d6e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_31.txt @@ -0,0 +1 @@ +The LLaMA tokenizer is a BPE model based on sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_32.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..374006591f8097ecca75b46bb3007c5fd9fd8426 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_32.txt @@ -0,0 +1 @@ +One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_33.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..62fb268d0481eb7d8c2c42849d091bd15c50e4bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_33.txt @@ -0,0 +1 @@ +"Banana"), the tokenizer does not prepend the prefix space to the string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_34.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..317269c5ae4d758dd3a39c9dd4d4477d475895dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_34.txt @@ -0,0 +1 @@ +Code Llama has the same architecture as the Llama2 models, refer to Llama2's documentation page for the API reference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_35.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..f06a0411716bf700d8d459dc1df3dbb13fe7eea4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_35.txt @@ -0,0 +1 @@ +Find Code Llama tokenizer reference below. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_code_llama/chunk_36.txt b/chunked/content_aware_chunking/model_doc_code_llama/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..9af5c8bb0ceac40fe88fb366238503b7083b6905 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_code_llama/chunk_36.txt @@ -0,0 +1,13 @@ +CodeLlamaTokenizer +[[autodoc]] CodeLlamaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +CodeLlamaTokenizerFast +[[autodoc]] CodeLlamaTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - update_post_processor + - save_vocabulary \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_10.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..14784abfa7293d4e4e2c654bcca6d1ec9767ac1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_10.txt @@ -0,0 +1 @@ +In addition, our model CodeGen (with up to 16B parameters trained on TPU-v4) outperforms OpenAI's Codex on the HumanEval benchmark. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_11.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c94eb247233fa927b286501f073163e71415a57f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_11.txt @@ -0,0 +1 @@ +We make the training library JaxFormer including checkpoints available as open source contribution: this https URL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_12.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..517c20ce680184762adcb67a8c10ec359abaef9f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by Hiroaki Hayashi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_13.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_14.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f6fdb5f470e762bbb7e6f50e73d28af2be4f81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_14.txt @@ -0,0 +1,3 @@ +Checkpoint Naming + +CodeGen model checkpoints are available on different pre-training data with variable sizes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_15.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..064beac577170e3cd5ed9cd84085236349590b72 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_15.txt @@ -0,0 +1,8 @@ +The format is: Salesforce/codegen-{size}-{data}, where +size: 350M, 2B, 6B, 16B +data: +nl: Pre-trained on the Pile +multi: Initialized with nl, then further pre-trained on multiple programming languages data +mono: Initialized with multi, then further pre-trained on Python data + +For example, Salesforce/codegen-350M-mono offers a 350 million-parameter checkpoint pre-trained sequentially on the Pile, multiple programming languages, and Python. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_16.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..66f79a55600c443eb82fed872b0c86af6820a491 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_16.txt @@ -0,0 +1,33 @@ +Usage example +python + +from transformers import AutoModelForCausalLM, AutoTokenizer +checkpoint = "Salesforce/codegen-350M-mono" +model = AutoModelForCausalLM.from_pretrained(checkpoint) +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +text = "def hello_world():" +completion = model.generate(**tokenizer(text, return_tensors="pt")) +print(tokenizer.decode(completion[0])) +def hello_world(): + print("Hello World") + +hello_world() + +Resources + +Causal language modeling task guide + +CodeGenConfig +[[autodoc]] CodeGenConfig + - all +CodeGenTokenizer +[[autodoc]] CodeGenTokenizer + - save_vocabulary +CodeGenTokenizerFast +[[autodoc]] CodeGenTokenizerFast +CodeGenModel +[[autodoc]] CodeGenModel + - forward +CodeGenForCausalLM +[[autodoc]] CodeGenForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_5.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6a999806e11bd8cdb800661993d8e69c0eea6ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_5.txt @@ -0,0 +1 @@ +It treats program synthesis as a sequence prediction problem, in which the specification is expressed in natural language and the desired program is conditionally sampled. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_6.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..008b188595b26a4941870321198036bc30667667 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_6.txt @@ -0,0 +1 @@ +We train a family of large language models, called CodeGen, on natural language and programming language data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_7.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..35c714b8012bddbeb9b082bd5be074f32c16768c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_7.txt @@ -0,0 +1 @@ +With weak supervision in the data and the scaling up of data size and model size, conversational capacities emerge from the simple autoregressive language modeling.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_8.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..819910d2078fb4bbcc0e968fe32e98e768f05d8e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_8.txt @@ -0,0 +1 @@ +To study the model behavior on conversational program synthesis, we develop a multi-turn programming benchmark (MTPB), where solving each problem requires multi-step synthesis via multi-turn conversation between the user and the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_codegen/chunk_9.txt b/chunked/content_aware_chunking/model_doc_codegen/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..55bb30e47452d94edb74339ca28cb8d841e23dac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_codegen/chunk_9.txt @@ -0,0 +1 @@ +Our findings show the emergence of conversational capabilities and the effectiveness of the proposed conversational program synthesis paradigm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_10.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd3b49d0b8859305f4ace5439cf98b0b7122bba4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_10.txt @@ -0,0 +1 @@ +Code is available at https://github.com/Atten4Vis/ConditionalDETR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_11.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae7378514604433c43aa04157f8f744f459d9243 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_11.txt @@ -0,0 +1 @@ +Conditional DETR shows much faster convergence compared to the original DETR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_12.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_13.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..58121b59c20d8ae11c797d407dec4d1255fc9cb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by DepuMeng. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_14.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_15.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2841d1e17763f6fc88e6caa7101dd0800f1112b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_15.txt @@ -0,0 +1,29 @@ +Resources + +Object detection task guide + +ConditionalDetrConfig +[[autodoc]] ConditionalDetrConfig +ConditionalDetrImageProcessor +[[autodoc]] ConditionalDetrImageProcessor + - preprocess + - post_process_object_detection + - post_process_instance_segmentation + - post_process_semantic_segmentation + - post_process_panoptic_segmentation +ConditionalDetrFeatureExtractor +[[autodoc]] ConditionalDetrFeatureExtractor + - call + - post_process_object_detection + - post_process_instance_segmentation + - post_process_semantic_segmentation + - post_process_panoptic_segmentation +ConditionalDetrModel +[[autodoc]] ConditionalDetrModel + - forward +ConditionalDetrForObjectDetection +[[autodoc]] ConditionalDetrForObjectDetection + - forward +ConditionalDetrForSegmentation +[[autodoc]] ConditionalDetrForSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_6.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..81374545f274598e0994137fc1f335b442ee4135 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_6.txt @@ -0,0 +1 @@ +Our approach, named conditional DETR, learns a conditional spatial query from the decoder embedding for decoder multi-head cross-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_7.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c684fa696c7168412eebb2b9af6b89e4758451b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_7.txt @@ -0,0 +1 @@ +The benefit is that through the conditional spatial query, each cross-attention head is able to attend to a band containing a distinct region, e.g., one object extremity or a region inside the object box. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_8.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..42d2a27e5e20ce666a852e0caa3db85c7cf96548 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_8.txt @@ -0,0 +1 @@ +This narrows down the spatial range for localizing the distinct regions for object classification and box regression, thus relaxing the dependence on the content embeddings and easing the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_9.txt b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b37c4652352eda7db69f6952088014131dbadc7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_conditional_detr/chunk_9.txt @@ -0,0 +1 @@ +Empirical results show that conditional DETR converges 6.7× faster for the backbones R50 and R101 and 10× faster for stronger backbones DC5-R50 and DC5-R101. 
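A hedged inference sketch built on the post-processing helpers listed in the Conditional DETR API section above; the checkpoint name microsoft/conditional-detr-resnet-50, the COCO test image URL, and the 0.7 score threshold are illustrative assumptions rather than part of the original docs.
python

import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, ConditionalDetrForObjectDetection

# load a sample image to run detection on
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# any Conditional DETR object-detection checkpoint should work the same way
processor = AutoImageProcessor.from_pretrained("microsoft/conditional-detr-resnet-50")
model = ConditionalDetrForObjectDetection.from_pretrained("microsoft/conditional-detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# turn raw outputs into (score, label, box) detections at the original image resolution
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.7, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(v, 1) for v in box.tolist()])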
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..69a5cc9ece399bb949f4c4713a3de97e3e2ccdc3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by abhishek. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8f9a93fa76c7573b6b20a174db992aed85cfa64 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_11.txt @@ -0,0 +1,4 @@ +The original implementation can be found +here: https://github.com/yitu-opensource/ConvBert +Usage tips +ConvBERT training tips are similar to those of BERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..98519bd6d490b0a0fb6256b443dc12ffff504cd3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_12.txt @@ -0,0 +1 @@ +For usage tips refer to BERT documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c77a7db074eec516b03970021badd5510f8dbdf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_13.txt @@ -0,0 +1,56 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +ConvBertConfig +[[autodoc]] ConvBertConfig +ConvBertTokenizer +[[autodoc]] ConvBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +ConvBertTokenizerFast +[[autodoc]] ConvBertTokenizerFast + +ConvBertModel +[[autodoc]] ConvBertModel + - forward +ConvBertForMaskedLM +[[autodoc]] ConvBertForMaskedLM + - forward +ConvBertForSequenceClassification +[[autodoc]] ConvBertForSequenceClassification + - forward +ConvBertForMultipleChoice +[[autodoc]] ConvBertForMultipleChoice + - forward +ConvBertForTokenClassification +[[autodoc]] ConvBertForTokenClassification + - forward +ConvBertForQuestionAnswering +[[autodoc]] ConvBertForQuestionAnswering + - forward + +TFConvBertModel +[[autodoc]] TFConvBertModel + - call +TFConvBertForMaskedLM +[[autodoc]] TFConvBertForMaskedLM + - call +TFConvBertForSequenceClassification +[[autodoc]] TFConvBertForSequenceClassification + - call +TFConvBertForMultipleChoice +[[autodoc]] TFConvBertForMultipleChoice + - call +TFConvBertForTokenClassification +[[autodoc]] TFConvBertForTokenClassification + - call +TFConvBertForQuestionAnswering +[[autodoc]] TFConvBertForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba99dc1a7e22394a4bd5ff83bd81f19e02caac9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_5.txt @@ -0,0 +1,3 @@ +The novel convolution heads, 
together with the +rest self-attention heads, form a new mixed attention block that is more efficient at both global and local context +learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d914ab25416269e4bb9a10d9763c379dd873c6ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_6.txt @@ -0,0 +1 @@ +We equip BERT with this mixed attention design and build a ConvBERT model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..814c6e1c77c0d1a6b9157a8f45c40c26c8d84ffe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_7.txt @@ -0,0 +1,3 @@ +Experiments have shown that +ConvBERT significantly outperforms BERT and its variants in various downstream tasks, with lower training cost and +fewer model parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a5d5c6df3dfca96029e636d8d68bd3595e26548 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_8.txt @@ -0,0 +1,2 @@ +Remarkably, ConvBERTbase model achieves 86.4 GLUE score, 0.7 higher than ELECTRAbase, while +using less than 1/4 training cost. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convbert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_convbert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..81b6e84600382d6d1b68722fc9dbf2e9eef133b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convbert/chunk_9.txt @@ -0,0 +1 @@ +Code and pre-trained models will be released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_10.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ea46e071b8c7ea0ce4913ab5c4151f9d6f5c62d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_10.txt @@ -0,0 +1 @@ +ConvNeXT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_11.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_12.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_13.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b0c3d8e5d4a5bd4cd50bab4e01b8abc87f128c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_13.txt @@ -0,0 +1,2 @@ +TensorFlow version of the model was contributed by ariG23498, +gante, and sayakpaul (equal contribution). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_14.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_15.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0a0cb8249f0d4eaa86635e3b7f2855d85630e2e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ConvNeXT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_16.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf2df51113e540bc8d1da9962b487bd8dffb994 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_16.txt @@ -0,0 +1 @@ +[ConvNextForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_17.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_18.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_19.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1550a8e167da7ac2cd1a1d3fc6be4ec9f6d0f267 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_19.txt @@ -0,0 +1,21 @@ +ConvNextConfig +[[autodoc]] ConvNextConfig +ConvNextFeatureExtractor +[[autodoc]] ConvNextFeatureExtractor +ConvNextImageProcessor +[[autodoc]] ConvNextImageProcessor + - preprocess + +ConvNextModel +[[autodoc]] ConvNextModel + - forward +ConvNextForImageClassification +[[autodoc]] ConvNextForImageClassification + - forward + +TFConvNextModel +[[autodoc]] TFConvNextModel + - call +TFConvNextForImageClassification +[[autodoc]] TFConvNextForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_7.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a91b73392cc32017846ed888f25cf37ae356f7b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_7.txt @@ -0,0 +1,2 @@ +We gradually "modernize" a standard ResNet toward the design +of a vision Transformer, and discover several key components that contribute to the performance difference along the way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_8.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..babe5679590fc1361ae92c767429c7cafd9ff540 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_8.txt @@ -0,0 +1,2 @@ +The outcome of this exploration is a family of pure ConvNet models +dubbed ConvNeXt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnext/chunk_9.txt b/chunked/content_aware_chunking/model_doc_convnext/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c537df12995d4324945fcd34703915f53dffa23a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnext/chunk_9.txt @@ -0,0 +1,2 @@ +Constructed entirely from standard ConvNet modules, ConvNeXts compete favorably with Transformers in terms of accuracy and scalability, achieving 87.8% ImageNet top-1 accuracy +and outperforming Swin Transformers on COCO detection and ADE20K segmentation, while maintaining the simplicity and efficiency of standard ConvNets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_10.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..773cf31a6bf61e91bd6c3fb37484f259b8192082 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by adirik. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..faac2d512eacef194921d020ee8c61aaa2b7d29a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_13.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ConvNeXt V2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..458e18bceded97f4e9b44ea4b602159d5cb2a707 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_14.txt @@ -0,0 +1 @@ +[ConvNextV2ForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_15.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_16.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b258033285436d16f7d8c675598d306148e240a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_17.txt @@ -0,0 +1,14 @@ +ConvNextV2Config +[[autodoc]] ConvNextV2Config +ConvNextV2Model +[[autodoc]] ConvNextV2Model + - forward +ConvNextV2ForImageClassification +[[autodoc]] ConvNextV2ForImageClassification + - forward +TFConvNextV2Model +[[autodoc]] TFConvNextV2Model + - call +TFConvNextV2ForImageClassification +[[autodoc]] TFConvNextV2ForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c90df6cd3dd7ae7a155a7591eb894cddd383f600 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_7.txt @@ -0,0 +1 @@ +This co-design of self-supervised learning techniques and architectural improvement results in a new model family called ConvNeXt V2, which significantly improves the performance of pure ConvNets on various recognition benchmarks, including ImageNet classification, COCO detection, and ADE20K segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f83181ef5a81bf2ccabce93a3e4770b80275f48 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_8.txt @@ -0,0 +1 @@ +We also provide pre-trained ConvNeXt V2 models of various sizes, ranging from an efficient 3.7M-parameter Atto model with 76.7% top-1 accuracy on ImageNet, to a 650M Huge model that achieves a state-of-the-art 88.9% accuracy using only public training data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_convnextv2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..602da8c7734d4f2aecf5475139909885dcb002fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_convnextv2/chunk_9.txt @@ -0,0 +1 @@ +ConvNeXt V2 architecture. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad000ceba3d792ee5cf20b9d9cffe386771c12c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_10.txt @@ -0,0 +1,4 @@ +CpmTokenizer +[[autodoc]] CpmTokenizer +CpmTokenizerFast +[[autodoc]] CpmTokenizerFast \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_5.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..4737681d550de79e50b48707b8b5ef8baac61336 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_5.txt @@ -0,0 +1,4 @@ +To the best +of our knowledge, CPM, with 2.6 billion parameters and 100GB Chinese training data, is the largest Chinese pre-trained +language model, which could facilitate several downstream Chinese NLP tasks, such as conversation, essay generation, +cloze test, and language understanding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9df2c59dece96ec73cac6d47838524047c09485 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_6.txt @@ -0,0 +1,2 @@ +Extensive experiments demonstrate that CPM achieves strong performance on many +NLP tasks in the settings of few-shot (even zero-shot) learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..87615c31640a1a851dea6650f3c5001796d0ba36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by canwenxu. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..42fc2c1de559b53899acdfd2e03233c46c9848d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_8.txt @@ -0,0 +1,4 @@ +The original implementation can be found +here: https://github.com/TsinghuaAI/CPM-Generate + +CPM's architecture is the same as GPT-2, except for tokenization method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_cpm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c69c7c48e391712ff946d3b07d0a7d91f4e7fdf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpm/chunk_9.txt @@ -0,0 +1,2 @@ +Refer to GPT-2 documentation for +API reference information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_2.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff089821ac77ee07db2420b96baafdb5507376d1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_2.txt @@ -0,0 +1 @@ +The training process is cost-effective and environment-friendly. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_3.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ec634be2f548bd7b63a7cca8a62410f45019c5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_3.txt @@ -0,0 +1 @@ +CPM-Ant also achieves promising results with delta tuning on the CUGE benchmark. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_4.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6a440b346f9a88b38b5a6b40f541db2f5124185 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_4.txt @@ -0,0 +1 @@ +Besides the full model, we also provide various compressed versions to meet the requirements of different hardware configurations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_5.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fe58adddab98b4c494c3767296ecd3f75922af0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_5.txt @@ -0,0 +1,2 @@ +See more +This model was contributed by OpenBMB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_6.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_7.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..585eec081273021e67a47c3df0baf1871e7e3ec3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_7.txt @@ -0,0 +1,3 @@ +Resources + +A tutorial on CPM-Live. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cpmant/chunk_8.txt b/chunked/content_aware_chunking/model_doc_cpmant/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e0a72532e8e086a0306fed639025c9d21d7eb93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cpmant/chunk_8.txt @@ -0,0 +1,12 @@ +CpmAntConfig +[[autodoc]] CpmAntConfig + - all +CpmAntTokenizer +[[autodoc]] CpmAntTokenizer + - all +CpmAntModel +[[autodoc]] CpmAntModel + - all +CpmAntForCausalLM +[[autodoc]] CpmAntForCausalLM + - all \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_10.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..71ce61054bb0f053fc5c15e349e2239e9c066653 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_10.txt @@ -0,0 +1,2 @@ +Refer to the original implementation for + more information. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_11.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..816de400c93a891df59a20fb97e6d610c4f9d0ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_11.txt @@ -0,0 +1,2 @@ +CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_12.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..5acea912fdba73f478b575cc61caab0a688620ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_12.txt @@ -0,0 +1,2 @@ +CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next + token in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_13.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..89200868d71bbf088007e3ac178fba719ae48aa7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_13.txt @@ -0,0 +1,2 @@ +Leveraging this feature allows CTRL to generate syntactically coherent text as it can be + observed in the run_generation.py example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_14.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..232806d13859c80a3662558c4fb594705d6d1909 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_14.txt @@ -0,0 +1 @@ +The PyTorch models can take the past_key_values as input, which is the previously computed key/value attention pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_15.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..26f3c04f4aa86e5345787613fedad3d87a7c9dd3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_15.txt @@ -0,0 +1 @@ +TensorFlow models accepts past as input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_16.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..09564d82251d88b27c74f704939cb1f07b8c6b54 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_16.txt @@ -0,0 +1,2 @@ +Using the past_key_values value prevents the model from re-computing + pre-computed values in the context of text generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_17.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b20818e05bf91d493ed573e5f4a8390975ac6f3f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_17.txt @@ -0,0 +1,2 @@ +See the forward + method for more information on the usage of this argument. 
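A short sketch of the past_key_values mechanics described above, assuming the Salesforce/ctrl checkpoint and the Links control code as illustrative choices; the full sampling loop is omitted.
python

import torch
from transformers import CTRLTokenizer, CTRLLMHeadModel

tokenizer = CTRLTokenizer.from_pretrained("Salesforce/ctrl")
model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")

# generations should start from a control code, e.g. "Links"
inputs = tokenizer("Links Hello, my name is", return_tensors="pt")

# first pass: ask the model to return its key/value cache along with the logits
outputs = model(**inputs, use_cache=True)
past_key_values = outputs.past_key_values
next_token = outputs.logits[:, -1:].argmax(dim=-1)

# later passes: feed only the newly chosen token plus the cache, so nothing is recomputed
outputs = model(input_ids=next_token, past_key_values=past_key_values, use_cache=True)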
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_18.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..22a631162e2819aefe66e0fb7e8066db5e7e42fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_18.txt @@ -0,0 +1,30 @@ +Resources + +Text classification task guide +Causal language modeling task guide + +CTRLConfig +[[autodoc]] CTRLConfig +CTRLTokenizer +[[autodoc]] CTRLTokenizer + - save_vocabulary + +CTRLModel +[[autodoc]] CTRLModel + - forward +CTRLLMHeadModel +[[autodoc]] CTRLLMHeadModel + - forward +CTRLForSequenceClassification +[[autodoc]] CTRLForSequenceClassification + - forward + +TFCTRLModel +[[autodoc]] TFCTRLModel + - call +TFCTRLLMHeadModel +[[autodoc]] TFCTRLLMHeadModel + - call +TFCTRLForSequenceClassification +[[autodoc]] TFCTRLForSequenceClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_7.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f342acc2d50daf48b13d86c01e48590d3c63bea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by keskarnitishr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_8.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_8.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ctrl/chunk_9.txt b/chunked/content_aware_chunking/model_doc_ctrl/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb8e01202c7718cc789ac37d8f01868f7a8359a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ctrl/chunk_9.txt @@ -0,0 +1,4 @@ +Usage tips + +CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences + or links to generate coherent text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cbbf61f256f3f82894221aea8294691df533df5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +CvT models are regular Vision Transformers, but trained with convolutions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aa6fa44390e1ba10ccdbb0c57cf3f8f6985e6b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_12.txt @@ -0,0 +1 @@ +They outperform the original model (ViT) when fine-tuned on ImageNet-1K and CIFAR-100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c4b5cd964c9b9598a463c1cc9c482346bb8079 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_13.txt @@ -0,0 +1 @@ +You can check out demo notebooks regarding inference as well as fine-tuning on custom data here (you can just replace [ViTFeatureExtractor] by [AutoImageProcessor] and [ViTForImageClassification] by [CvtForImageClassification]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9110822cc92bacf734084468b3b69263c637c875 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_14.txt @@ -0,0 +1,2 @@ +The available checkpoints are either (1) pre-trained on ImageNet-22k (a collection of 14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..800744d196482d707437ab5ee6bdfcc41e63a3d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with CvT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..51d69c38408210b245ebd5f5ca083cdfa86e635e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_16.txt @@ -0,0 +1 @@ +[CvtForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
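The CvT chunks above say the ViT notebooks carry over once `[ViTFeatureExtractor]` is swapped for `[AutoImageProcessor]` and `[ViTForImageClassification]` for `[CvtForImageClassification]`. A short inference sketch under that substitution could look like this; the `microsoft/cvt-13` checkpoint id and the COCO test image URL are assumptions, not part of the chunked text.

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, CvtForImageClassification

processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")  # assumed checkpoint id
model = CvtForImageClassification.from_pretrained("microsoft/cvt-13")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any test image works
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # predicted ImageNet-1k class
```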
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..f967770b81b53352e981fcb02f32ed121c23b8b7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_19.txt @@ -0,0 +1,16 @@ +CvtConfig +[[autodoc]] CvtConfig + +CvtModel +[[autodoc]] CvtModel + - forward +CvtForImageClassification +[[autodoc]] CvtForImageClassification + - forward + +TFCvtModel +[[autodoc]] TFCvtModel + - call +TFCvtForImageClassification +[[autodoc]] TFCvtForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8baae4df5a4cde45a19c0a498ff1fc7a46464567 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_6.txt @@ -0,0 +1,2 @@ +In addition, +performance gains are maintained when pretrained on larger datasets (\eg ImageNet-22k) and fine-tuned to downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b465c269ea72926c8987b51500725b335ec1f4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_7.txt @@ -0,0 +1,2 @@ +Pre-trained on +ImageNet-22k, our CvT-W24 obtains a top-1 accuracy of 87.7\% on the ImageNet-1k val set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d0b25aaaae8252f04636e1ead52b725a179ac92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_8.txt @@ -0,0 +1,2 @@ +Finally, our results show that the positional encoding, +a crucial component in existing Vision Transformers, can be safely removed in our model, simplifying the design for higher resolution vision tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_cvt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_cvt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..48ec3b90e5f98587018baf8373027433c9d221d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_cvt/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by anugunj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_10.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd55c03b4f0c74f05bd171014f07cae60f31d43e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_10.txt @@ -0,0 +1 @@ +sayakpaul and Rocketknight1 contributed Data2Vec for vision in TensorFlow. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_11.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..91300d9a71c4cd6edc5c65a21969ef4e4e682cf2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_11.txt @@ -0,0 +1 @@ +The original code (for NLP and Speech) can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_12.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3ba1d3a008308efec2751fbcb9f0d32eb6f65c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_12.txt @@ -0,0 +1 @@ +The original code for vision can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_13.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa4a3783d128b5d5ea64932a42ccdfe87941e328 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_14.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a8953b6ccee95cdad52590153204869ff9ba7c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_14.txt @@ -0,0 +1,2 @@ +For Data2VecAudio, preprocessing is identical to [Wav2Vec2Model], including feature extraction +For Data2VecText, preprocessing is identical to [RobertaModel], including tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_15.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..e641225b7baa3c0b9c73ff3386cc4e4f6c0f5b2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_15.txt @@ -0,0 +1 @@ +For Data2VecVision, preprocessing is identical to [BeitModel], including feature extraction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_16.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..54bbb93fa06c1940de7a526740a227df962c2722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_16.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Data2Vec. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_17.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..893d44e645c48a77e2f6df7bbb33e6488b7cee60 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_17.txt @@ -0,0 +1 @@ +[Data2VecVisionForImageClassification] is supported by this example script and notebook. 
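The Data2Vec chunks above state that preprocessing for the audio model mirrors Wav2Vec2 and for the text model mirrors RoBERTa. The sketch below illustrates that parity through the Auto classes; the two checkpoint ids are assumptions about the Hub repos rather than content of the chunked text.

```python
import numpy as np
import torch
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    Data2VecAudioModel,
    Data2VecTextModel,
)

# Text: tokenization mirrors RoBERTa (assumed checkpoint id).
tokenizer = AutoTokenizer.from_pretrained("facebook/data2vec-text-base")
text_model = Data2VecTextModel.from_pretrained("facebook/data2vec-text-base")
text_inputs = tokenizer("data2vec uses one objective for text, audio and vision.", return_tensors="pt")
with torch.no_grad():
    text_hidden = text_model(**text_inputs).last_hidden_state

# Audio: feature extraction mirrors Wav2Vec2, raw 16 kHz waveform in (assumed checkpoint id).
extractor = AutoFeatureExtractor.from_pretrained("facebook/data2vec-audio-base-960h")
audio_model = Data2VecAudioModel.from_pretrained("facebook/data2vec-audio-base-960h")
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence as a stand-in
audio_inputs = extractor(waveform, sampling_rate=16000, return_tensors="pt")
with torch.no_grad():
    audio_hidden = audio_model(**audio_inputs).last_hidden_state
```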
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_18.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..794c830593037875ce4110b6e07faa6b6c09d026 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_18.txt @@ -0,0 +1 @@ +To fine-tune [TFData2VecVisionForImageClassification] on a custom dataset, see this notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_19.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2050a56501b9a046a39948a3c467137701d046b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_19.txt @@ -0,0 +1,14 @@ +Data2VecText documentation resources +- Text classification task guide +- Token classification task guide +- Question answering task guide +- Causal language modeling task guide +- Masked language modeling task guide +- Multiple choice task guide +Data2VecAudio documentation resources +- Audio classification task guide +- Automatic speech recognition task guide +Data2VecVision documentation resources +- Image classification +- Semantic segmentation +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_20.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_20.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_21.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e2a89c3f5f35f7b37b7f935a0ccb5e373ef306d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_21.txt @@ -0,0 +1,62 @@ +Data2VecTextConfig +[[autodoc]] Data2VecTextConfig +Data2VecAudioConfig +[[autodoc]] Data2VecAudioConfig +Data2VecVisionConfig +[[autodoc]] Data2VecVisionConfig + +Data2VecAudioModel +[[autodoc]] Data2VecAudioModel + - forward +Data2VecAudioForAudioFrameClassification +[[autodoc]] Data2VecAudioForAudioFrameClassification + - forward +Data2VecAudioForCTC +[[autodoc]] Data2VecAudioForCTC + - forward +Data2VecAudioForSequenceClassification +[[autodoc]] Data2VecAudioForSequenceClassification + - forward +Data2VecAudioForXVector +[[autodoc]] Data2VecAudioForXVector + - forward +Data2VecTextModel +[[autodoc]] Data2VecTextModel + - forward +Data2VecTextForCausalLM +[[autodoc]] Data2VecTextForCausalLM + - forward +Data2VecTextForMaskedLM +[[autodoc]] Data2VecTextForMaskedLM + - forward +Data2VecTextForSequenceClassification +[[autodoc]] Data2VecTextForSequenceClassification + - forward +Data2VecTextForMultipleChoice +[[autodoc]] Data2VecTextForMultipleChoice + - forward +Data2VecTextForTokenClassification +[[autodoc]] Data2VecTextForTokenClassification + - forward +Data2VecTextForQuestionAnswering +[[autodoc]] Data2VecTextForQuestionAnswering + - forward +Data2VecVisionModel +[[autodoc]] Data2VecVisionModel + - forward +Data2VecVisionForImageClassification +[[autodoc]] Data2VecVisionForImageClassification + - forward +Data2VecVisionForSemanticSegmentation +[[autodoc]] Data2VecVisionForSemanticSegmentation + - forward + +TFData2VecVisionModel +[[autodoc]] TFData2VecVisionModel + - call +TFData2VecVisionForImageClassification +[[autodoc]] TFData2VecVisionForImageClassification + - call +TFData2VecVisionForSemanticSegmentation +[[autodoc]] TFData2VecVisionForSemanticSegmentation + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_5.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1458a39075a1ee04844f30328c93400680256af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_5.txt @@ -0,0 +1,2 @@ +The core idea is to predict latent representations of the full input data based on a +masked view of the input in a selfdistillation setup using a standard Transformer architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_6.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2105d4f7bcd481967dc251ccc3cbc31c27a209d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_6.txt @@ -0,0 +1,3 @@ +Instead of predicting modality-specific targets such as words, visual tokens or units of human speech which +are local in nature, data2vec predicts contextualized latent representations that contain information from +the entire input. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_7.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..da2a12712ce1832ce3369c9f75c99e4549afda32 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_7.txt @@ -0,0 +1,2 @@ +Experiments on the major benchmarks of speech recognition, image classification, and +natural language understanding demonstrate a new state of the art or competitive performance to predominant approaches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_8.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1e4c010be824d93603a36b180ffd5c1b1b287cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_8.txt @@ -0,0 +1 @@ +Models and code are available at www.github.com/pytorch/fairseq/tree/master/examples/data2vec. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_data2vec/chunk_9.txt b/chunked/content_aware_chunking/model_doc_data2vec/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..663f88aa61233b56b287baec2d679020c02d165a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_data2vec/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by edugp and patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..346cf548ecf3b0b4b774c73c8b7bd04fe38cf573 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_10.txt @@ -0,0 +1 @@ +DeBERTa v2 is the second version of the DeBERTa model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6b63e20d5cf5e6465907e1b0c62da1852f8ca87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_11.txt @@ -0,0 +1,2 @@ +It includes +the 1.5B model used for the SuperGLUE single-model submission and achieving 89.9, versus human baseline 89.8. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..db66fc95308e99bdefb91d0b9e6a8bb75407ac8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_12.txt @@ -0,0 +1,6 @@ +You can +find more details about this submission in the authors' +blog +New in v2: + +Vocabulary In v2 the tokenizer is changed to use a new vocabulary of size 128K built from the training data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..821e5dbf47d3f7c97ca9acb4e2364310b3243620 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_13.txt @@ -0,0 +1,2 @@ +Instead of a GPT2-based tokenizer, the tokenizer is now + sentencepiece-based tokenizer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d98133ce82715368b02a0db2608f29119957eca0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_14.txt @@ -0,0 +1,2 @@ +nGiE(nGram Induced Input Encoding) The DeBERTa-v2 model uses an additional convolution layer aside with the first + transformer layer to better learn the local dependency of input tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..944fe0ae1a8370a3879a51d0fb7e62c8e508b04e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_15.txt @@ -0,0 +1,2 @@ +Sharing position projection matrix with content projection matrix in attention layer Based on previous + experiments, this can save parameters without affecting the performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f036b88250b367114c629994aad3218533845c72 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_16.txt @@ -0,0 +1,2 @@ +Apply bucket to encode relative positions The DeBERTa-v2 model uses log bucket to encode relative positions + similar to T5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..66444bfd2f5ddf9820fdcdacc7c48eb39ef0e4f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_17.txt @@ -0,0 +1,2 @@ +900M model & 1.5B model Two additional model sizes are available: 900M and 1.5B, which significantly improves the + performance of downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e8d32054f9736d5e3e502c34e70908d52618395 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_18.txt @@ -0,0 +1 @@ +This model was contributed by DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..71807c0b852a994b1be11fcc02b7ab6fef0d8df0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_19.txt @@ -0,0 +1,2 @@ +This model TF 2.0 implementation was +contributed by kamalkraj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_20.txt @@ -0,0 +1 @@ +The original code can be found here. 
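The DeBERTa-v2 chunks above note that v2 replaces the GPT2-based tokenizer with a sentencepiece-based one built on a 128K vocabulary. A minimal loading sketch, assuming the `microsoft/deberta-v2-xlarge` checkpoint id (not taken from the chunked text), might be:

```python
import torch
from transformers import AutoTokenizer, DebertaV2ForSequenceClassification

# The v2 tokenizer is sentencepiece-based, so the `sentencepiece` package must be installed.
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v2-xlarge")  # assumed checkpoint id
model = DebertaV2ForSequenceClassification.from_pretrained("microsoft/deberta-v2-xlarge", num_labels=2)

inputs = tokenizer("DeBERTa-v2 uses a 128K sentencepiece vocabulary.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 2); the head is untrained until fine-tuned
```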
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a6fc580fb315ecc89cedc65259b37dfe29d95e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_21.txt @@ -0,0 +1,64 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +DebertaV2Config +[[autodoc]] DebertaV2Config +DebertaV2Tokenizer +[[autodoc]] DebertaV2Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +DebertaV2TokenizerFast +[[autodoc]] DebertaV2TokenizerFast + - build_inputs_with_special_tokens + - create_token_type_ids_from_sequences + +DebertaV2Model +[[autodoc]] DebertaV2Model + - forward +DebertaV2PreTrainedModel +[[autodoc]] DebertaV2PreTrainedModel + - forward +DebertaV2ForMaskedLM +[[autodoc]] DebertaV2ForMaskedLM + - forward +DebertaV2ForSequenceClassification +[[autodoc]] DebertaV2ForSequenceClassification + - forward +DebertaV2ForTokenClassification +[[autodoc]] DebertaV2ForTokenClassification + - forward +DebertaV2ForQuestionAnswering +[[autodoc]] DebertaV2ForQuestionAnswering + - forward +DebertaV2ForMultipleChoice +[[autodoc]] DebertaV2ForMultipleChoice + - forward + +TFDebertaV2Model +[[autodoc]] TFDebertaV2Model + - call +TFDebertaV2PreTrainedModel +[[autodoc]] TFDebertaV2PreTrainedModel + - call +TFDebertaV2ForMaskedLM +[[autodoc]] TFDebertaV2ForMaskedLM + - call +TFDebertaV2ForSequenceClassification +[[autodoc]] TFDebertaV2ForSequenceClassification + - call +TFDebertaV2ForTokenClassification +[[autodoc]] TFDebertaV2ForTokenClassification + - call +TFDebertaV2ForQuestionAnswering +[[autodoc]] TFDebertaV2ForQuestionAnswering + - call +TFDebertaV2ForMultipleChoice +[[autodoc]] TFDebertaV2ForMultipleChoice + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3e87fc52a1b1b8192c059e76d1ae34e5db3afb4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_8.txt @@ -0,0 +1,2 @@ +The DeBERTa code and +pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a15d39936b49ba7d6e3c8a1424bda77a62bb286 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta-v2/chunk_9.txt @@ -0,0 +1,2 @@ +The following information is visible directly on the original implementation +repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_10.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..996b7c4b18a2b582e58f055de2093a066359c053 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_10.txt @@ -0,0 +1,2 @@ +This model TF 2.0 implementation was +contributed by kamalkraj . 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_11.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_12.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed72cc86765dc32aac30362a2d73aa099d1cdaed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_12.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_13.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_14.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..95719700540c99d05cd682947743e131c5414f9f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_15.txt @@ -0,0 +1 @@ +A blog post on how to Accelerate Large Model Training using DeepSpeed with DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1db26066965c4a7c339085bdc732e8ff97426982 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_16.txt @@ -0,0 +1 @@ +A blog post on Supercharged Customer Service with Machine Learning with DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..20044e9297fe4b2675ab69fa0b08dc97088bf915 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_17.txt @@ -0,0 +1 @@ +[DebertaForSequenceClassification] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_18.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..59a658f7f75c0b610cd856eb9ee7736085bd0cde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_18.txt @@ -0,0 +1 @@ +[TFDebertaForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_19.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa65bc0b846afec6b116c90e7fa8e97faafcdad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_19.txt @@ -0,0 +1,3 @@ +Text classification task guide + +[DebertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_20.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..abce0505c7a7ae35ee0953c68ed20fc78c296f5e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_20.txt @@ -0,0 +1 @@ +[TFDebertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_21.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_21.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_22.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfc0150b49018f65ce896229fc6500e24993f60b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_22.txt @@ -0,0 +1 @@ +Byte-Pair Encoding tokenization chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_23.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cbc209a1be984a03e87f4b29a4be80f807c47dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_23.txt @@ -0,0 +1,3 @@ +Token classification task guide + +[DebertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_24.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4c31ab59248acdfb23cc043c176d8c56ec516c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_24.txt @@ -0,0 +1 @@ +[TFDebertaForMaskedLM] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_25.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_25.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_26.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fa766de35bdced52fd1c38d66bc9b5cf24194a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_26.txt @@ -0,0 +1,3 @@ +Masked language modeling task guide + +[DebertaForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_27.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..92a95808d1600edb6b2a64d54fd121634efc9f45 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_27.txt @@ -0,0 +1 @@ +[TFDebertaForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_28.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_28.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_29.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6273313f8d3d7dece4332711d8b5ba05a1b691b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_29.txt @@ -0,0 +1,51 @@ +Question answering task guide + +DebertaConfig +[[autodoc]] DebertaConfig +DebertaTokenizer +[[autodoc]] DebertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +DebertaTokenizerFast +[[autodoc]] DebertaTokenizerFast + - build_inputs_with_special_tokens + - create_token_type_ids_from_sequences + +DebertaModel +[[autodoc]] DebertaModel + - forward +DebertaPreTrainedModel +[[autodoc]] DebertaPreTrainedModel +DebertaForMaskedLM +[[autodoc]] DebertaForMaskedLM + - forward +DebertaForSequenceClassification +[[autodoc]] DebertaForSequenceClassification + - forward +DebertaForTokenClassification +[[autodoc]] DebertaForTokenClassification + - forward +DebertaForQuestionAnswering +[[autodoc]] DebertaForQuestionAnswering + - forward + +TFDebertaModel +[[autodoc]] TFDebertaModel + - call +TFDebertaPreTrainedModel +[[autodoc]] TFDebertaPreTrainedModel + - call +TFDebertaForMaskedLM +[[autodoc]] TFDebertaForMaskedLM + - call +TFDebertaForSequenceClassification +[[autodoc]] TFDebertaForSequenceClassification + - call +TFDebertaForTokenClassification +[[autodoc]] TFDebertaForTokenClassification + - call +TFDebertaForQuestionAnswering +[[autodoc]] TFDebertaForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_6.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..282fc2953b7727786d5deaf47a71f015280baf53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_6.txt @@ -0,0 +1,2 @@ +We show that these two techniques significantly improve the efficiency +of model pretraining and performance of downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_7.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..47dc59d325310b2b24e825cc56cb6652ad8a05e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_7.txt @@ -0,0 +1,3 @@ +Compared to RoBERTa-Large, a DeBERTa model trained on half of +the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9% +(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_8.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3e87fc52a1b1b8192c059e76d1ae34e5db3afb4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_8.txt @@ -0,0 +1,2 @@ +The DeBERTa code and +pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa. 
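The DeBERTa chunks above list `[DebertaForQuestionAnswering]` among the task heads. A small extractive-QA sketch is given below; the `microsoft/deberta-base` checkpoint id and the toy question/context pair are assumptions, and the QA head is randomly initialised until the model is fine-tuned, so the extracted span is arbitrary here.

```python
import torch
from transformers import AutoTokenizer, DebertaForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")  # assumed checkpoint id
model = DebertaForQuestionAnswering.from_pretrained("microsoft/deberta-base")

question, context = "Who released DeBERTa?", "DeBERTa was released by Microsoft Research."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Pick the most likely start/end token positions and decode the span between them.
start = outputs.start_logits.argmax(-1).item()
end = outputs.end_logits.argmax(-1).item()
answer = tokenizer.decode(inputs["input_ids"][0][start : end + 1])
```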
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deberta/chunk_9.txt b/chunked/content_aware_chunking/model_doc_deberta/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e8d32054f9736d5e3e502c34e70908d52618395 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deberta/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by DeBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf9e9a859803b16dfb236ada87f62f02afbad40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_10.txt @@ -0,0 +1,8 @@ +DecisionTransformerConfig +[[autodoc]] DecisionTransformerConfig +DecisionTransformerGPT2Model +[[autodoc]] DecisionTransformerGPT2Model + - forward +DecisionTransformerModel +[[autodoc]] DecisionTransformerModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_4.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..02c609f2ddf7286f2fdb3f30cc49be4dd80a2c88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_4.txt @@ -0,0 +1,3 @@ +Unlike prior approaches to RL that fit value functions or + compute policy gradients, Decision Transformer simply outputs the optimal actions by leveraging a causally masked + Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_5.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..0460389c331e7338538f0b0e5dc8b446a3649f4d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_5.txt @@ -0,0 +1,2 @@ +By conditioning an autoregressive model on the desired return (reward), past states, and actions, our + Decision Transformer model can generate future actions that achieve the desired return. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8103b97a4f229733a999cb9acd3af5bd29a3c30 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_6.txt @@ -0,0 +1,3 @@ +Despite its simplicity, + Decision Transformer matches or exceeds the performance of state-of-the-art model-free offline RL baselines on + Atari, OpenAI Gym, and Key-to-Door tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..07b0cdaa8cb9131035f7a3ea5caf05d84b9ef19d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_7.txt @@ -0,0 +1 @@ +This version of the model is for tasks where the state is a vector. 
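The Decision Transformer chunks above describe conditioning an autoregressive model on the desired return, past states, and actions to predict the next action, with states given as vectors. The sketch below runs a tiny randomly initialised model on dummy trajectories to show the expected tensor shapes; the state/action dimensions and sequence length are illustrative assumptions.

```python
import torch
from transformers import DecisionTransformerConfig, DecisionTransformerModel

# Tiny randomly initialised model; 17-dim states and 6-dim actions are arbitrary choices.
config = DecisionTransformerConfig(state_dim=17, act_dim=6, max_ep_len=1000)
model = DecisionTransformerModel(config)
model.eval()

batch, seq = 1, 20
with torch.no_grad():
    outputs = model(
        states=torch.randn(batch, seq, config.state_dim),
        actions=torch.randn(batch, seq, config.act_dim),
        rewards=torch.randn(batch, seq, 1),
        returns_to_go=torch.randn(batch, seq, 1),  # the desired-return conditioning signal
        timesteps=torch.arange(seq).unsqueeze(0),
        attention_mask=torch.ones(batch, seq, dtype=torch.long),
    )

# action_preds holds the predicted action at every step; the last one is what a policy would execute next.
next_action = outputs.action_preds[:, -1]
```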
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d72235f9a70372c7a80c05a948c9a49008e0ca4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by edbeeching. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_decision_transformer/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_10.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_11.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d271d19fd696f5f12ce147614f536f48d98278ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +Training Deformable DETR is equivalent to training the original DETR model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_12.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c66e0561b0cc9117c23f28ce5fb7ada0a75812b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_12.txt @@ -0,0 +1 @@ +See the resources section below for demo notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..903da08952877bdab8c2ddaf4dd0c12a55f9165e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_13.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Deformable DETR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5e7df9eb57b2fc35ada36747d63f4789e178988 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_14.txt @@ -0,0 +1 @@ +Demo notebooks regarding inference + fine-tuning on a custom dataset for [DeformableDetrForObjectDetection] can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bdfaebbc7bf820669bc24b5e0fc0cc5d2a2dd99 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_15.txt @@ -0,0 +1 @@ +See also: Object detection task guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_16.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_17.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_18.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca7fd25434edefc6d9c7adec0020e1678530eb19 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_18.txt @@ -0,0 +1,16 @@ +DeformableDetrImageProcessor +[[autodoc]] DeformableDetrImageProcessor + - preprocess + - post_process_object_detection +DeformableDetrFeatureExtractor +[[autodoc]] DeformableDetrFeatureExtractor + - call + - post_process_object_detection +DeformableDetrConfig +[[autodoc]] DeformableDetrConfig +DeformableDetrModel +[[autodoc]] DeformableDetrModel + - forward +DeformableDetrForObjectDetection +[[autodoc]] DeformableDetrForObjectDetection + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_5.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..e23df9f21b6114a8748bcb439433cf98dfaf61ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_5.txt @@ -0,0 +1 @@ +Deformable DETR can achieve better performance than DETR (especially on small objects) with 10 times less training epochs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_6.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..62edfd5bb3a7d07f0d58ade41b4a050250eb6296 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_6.txt @@ -0,0 +1 @@ +Extensive experiments on the COCO benchmark demonstrate the effectiveness of our approach. 
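The Deformable DETR chunks above list `[DeformableDetrForObjectDetection]` and the image processor's `post_process_object_detection` helper. A short end-to-end inference sketch is shown below; the `SenseTime/deformable-detr` checkpoint id, the COCO test image URL, and the 0.5 score threshold are assumptions rather than content of the chunked text.

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, DeformableDetrForObjectDetection

processor = AutoImageProcessor.from_pretrained("SenseTime/deformable-detr")  # assumed checkpoint id
model = DeformableDetrForObjectDetection.from_pretrained("SenseTime/deformable-detr")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Convert raw logits/boxes into thresholded detections in the original image coordinates.
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
detections = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0]
for score, label, box in zip(detections["scores"], detections["labels"], detections["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(v, 1) for v in box.tolist()])
```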
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_7.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a88422f9754920a112b78f3f38ea72043dac843 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_7.txt @@ -0,0 +1 @@ +Deformable DETR architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_8.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_9.txt b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deformable_detr/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..165270b9bc3ce28c5c4e1d309cc17f684ff3cedc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_13.txt @@ -0,0 +1,2 @@ +We share our code and +models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_14.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c9cd9b48231b2e76f2f2941303830dca79b6041 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_15.txt @@ -0,0 +1 @@ +The TensorFlow version of this model was added by amyeroberts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e119892868798c288fedadb996336782d4cd8d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_16.txt @@ -0,0 +1,4 @@ +Usage tips + +Compared to ViT, DeiT models use a so-called distillation token to effectively learn from a teacher (which, in the + DeiT paper, is a ResNet like-model). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f97e42c2be1a4f0b92b57f45717368a57028d55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_17.txt @@ -0,0 +1,2 @@ +The distillation token is learned through backpropagation, by interacting with + the class ([CLS]) and patch tokens through the self-attention layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..25b65565d80380fe851305b2088209b54ab3b70e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_18.txt @@ -0,0 +1,3 @@ +There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top + of the final hidden state of the class token and not using the distillation signal, or (2) by placing both a + prediction head on top of the class token and on top of the distillation token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc04bcd69e620ecaedf132f7239bce7c3f194bba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_19.txt @@ -0,0 +1,4 @@ +In that case, the [CLS] prediction + head is trained using regular cross-entropy between the prediction of the head and the ground-truth label, while the + distillation prediction head is trained using hard distillation (cross-entropy between the prediction of the + distillation head and the label predicted by the teacher). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..878f97c11b2f18555c62e61cc6ff3f4ac4c0e5bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_20.txt @@ -0,0 +1,2 @@ +At inference time, one takes the average prediction + between both heads as final prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6df48086eaa014cfa8f3a3bf333915c2639d3d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_21.txt @@ -0,0 +1,2 @@ +(2) is also called "fine-tuning with distillation", because one relies on a + teacher that has already been fine-tuned on the downstream dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4ec1d7f0302ff18ba2a7b59f273d01e9f8acef8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_22.txt @@ -0,0 +1,3 @@ +In terms of models, (1) corresponds to + [DeiTForImageClassification] and (2) corresponds to + [DeiTForImageClassificationWithTeacher]. 
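The DeiT chunks above explain that the distilled variant carries both a class-token head and a distillation head, and that inference averages the two, which is what `[DeiTForImageClassificationWithTeacher]` exposes. A brief sketch of that inference path follows; the distilled checkpoint id and the test image URL are assumptions about Hub naming, not part of the chunked text.

```python
import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, DeiTForImageClassificationWithTeacher

checkpoint = "facebook/deit-base-distilled-patch16-224"  # assumed checkpoint id
processor = AutoImageProcessor.from_pretrained(checkpoint)
model = DeiTForImageClassificationWithTeacher.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# `logits` is already the average of the class-token head and the distillation head,
# matching the inference recipe described in the chunks above.
print(model.config.id2label[outputs.logits.argmax(-1).item()])
```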
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..463fc7c535e9b5562cdad61280ee00176bd30cb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_23.txt @@ -0,0 +1,2 @@ +Note that the authors also did try soft distillation for (2) (in which case the distillation prediction head is + trained using KL divergence to match the softmax output of the teacher), but hard distillation gave the best results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f1c715e8b6e9f9b4d78f550e1c4f392504bf557 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_24.txt @@ -0,0 +1 @@ +All released checkpoints were pre-trained and fine-tuned on ImageNet-1k only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_25.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfc40976a4f63372d853e16085cb220b2f4d1934 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_25.txt @@ -0,0 +1 @@ +No external data was used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_26.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..45a644e2560959b3b63c5f91326d28a38584ee13 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_26.txt @@ -0,0 +1,3 @@ +This is in + contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for + pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_27.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..46343a89899a6f76c35de811887c6c5006a1c1cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_27.txt @@ -0,0 +1,2 @@ +The authors of DeiT also released more efficiently trained ViT models, which you can directly plug into + [ViTModel] or [ViTForImageClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_28.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..322e55b47e6f1e57fcc3e14fc1b2a4e1db325f29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_28.txt @@ -0,0 +1,3 @@ +Techniques like data + augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset + (while only using ImageNet-1k for pre-training). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_29.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..9425b786a86e58f86434323b4f401d14609e8963 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_29.txt @@ -0,0 +1,3 @@ +There are 4 variants available (in 3 different sizes): + facebook/deit-tiny-patch16-224, facebook/deit-small-patch16-224, facebook/deit-base-patch16-224 and + facebook/deit-base-patch16-384. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_30.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..af9837db61bcf23aa3340cc625ec40dd0a523572 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_30.txt @@ -0,0 +1,2 @@ +Note that one should use [DeiTImageProcessor] in order to + prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_31.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..97277bd314a3d86cc3fd19b1314cc44cf9379fc3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_31.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DeiT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_32.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4f7bf4b13ba441a95c7591a4f2ae33e2bac928 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_32.txt @@ -0,0 +1 @@ +[DeiTForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_33.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..365fdcd2c8e4c1b4cb7019bcb702d3b21b8b8cca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_33.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Besides that: + +[DeiTForMaskedImageModeling] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_34.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_34.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_35.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_35.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
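A short inference sketch using one of the checkpoints listed above together with [DeiTImageProcessor]; the image URL is only an example:

```python
# Illustrative sketch: classify an image with a DeiT checkpoint named above.
from transformers import DeiTImageProcessor, DeiTForImageClassification
from PIL import Image
import requests
import torch

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DeiTImageProcessor.from_pretrained("facebook/deit-base-patch16-224")
model = DeiTForImageClassification.from_pretrained("facebook/deit-base-patch16-224")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])
```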
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deit/chunk_36.txt b/chunked/content_aware_chunking/model_doc_deit/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..71051e58ce4373bc942b4a0b0cb6f31a18c8f4de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deit/chunk_36.txt @@ -0,0 +1,34 @@ +DeiTConfig +[[autodoc]] DeiTConfig +DeiTFeatureExtractor +[[autodoc]] DeiTFeatureExtractor + - call +DeiTImageProcessor +[[autodoc]] DeiTImageProcessor + - preprocess + +DeiTModel +[[autodoc]] DeiTModel + - forward +DeiTForMaskedImageModeling +[[autodoc]] DeiTForMaskedImageModeling + - forward +DeiTForImageClassification +[[autodoc]] DeiTForImageClassification + - forward +DeiTForImageClassificationWithTeacher +[[autodoc]] DeiTForImageClassificationWithTeacher + - forward + +TFDeiTModel +[[autodoc]] TFDeiTModel + - call +TFDeiTForMaskedImageModeling +[[autodoc]] TFDeiTForMaskedImageModeling + - call +TFDeiTForImageClassification +[[autodoc]] TFDeiTForImageClassification + - call +TFDeiTForImageClassificationWithTeacher +[[autodoc]] TFDeiTForImageClassificationWithTeacher + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_10.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..72fb90360ef9e89cafb41eb3a5d45491e3c1852f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_10.txt @@ -0,0 +1 @@ +Compared with a SOTA model finetuned on more than >28k data points, DePlot+LLM with just one-shot prompting achieves a 24.0% improvement over finetuned SOTA on human-written queries from the task of chart QA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_11.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..88026d30d1e5a3a47e09664c87102af7a8848191 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_11.txt @@ -0,0 +1 @@ +DePlot is a model that is trained using Pix2Struct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_12.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f459d4af0391502ac736dcfeac7be5f782281cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_12.txt @@ -0,0 +1 @@ +You can find more information about Pix2Struct in the Pix2Struct documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..80379ccf3687f2d2f6ee87beecd4469aa6e272d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_13.txt @@ -0,0 +1 @@ +DePlot is a Visual Question Answering subset of Pix2Struct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..c516abbe4966b53b1497e49454ffbf38a39575d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_14.txt @@ -0,0 +1 @@ +It renders the input question on the image and predicts the answer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3939e26fb4b4c9f5318677c45d7e33ea8b650948 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_15.txt @@ -0,0 +1,19 @@ +Usage example +Currently one checkpoint is available for DePlot: + +google/deplot: DePlot fine-tuned on ChartQA dataset + +python +from transformers import AutoProcessor, Pix2StructForConditionalGeneration +import requests +from PIL import Image +model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot") +processor = AutoProcessor.from_pretrained("google/deplot") +url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/5090.png" +image = Image.open(requests.get(url, stream=True).raw) +inputs = processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt") +predictions = model.generate(**inputs, max_new_tokens=512) +print(processor.decode(predictions[0], skip_special_tokens=True)) + +Fine-tuning +To fine-tune DePlot, refer to the pix2struct fine-tuning notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d53c506baf4a63861148289836016fe76132984 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_16.txt @@ -0,0 +1,7 @@ +For Pix2Struct models, we have found that fine-tuning the model with Adafactor and a cosine learning rate scheduler leads to faster convergence: +python +from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup +optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05) +scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000) + +DePlot is a model trained using the Pix2Struct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b1c09d1c3769d4db92d15420d2c1cf731b8e366 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_17.txt @@ -0,0 +1 @@ +For API reference, see the Pix2Struct documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_7.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..42dc2f60b54b328f0ff4783da814182ac6cee593 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_7.txt @@ -0,0 +1 @@ +The output of DePlot can then be directly used to prompt a pretrained large language model (LLM), exploiting the few-shot reasoning capabilities of LLMs.
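Building on the optimizer snippet above, a hypothetical end-to-end fine-tuning step could look as follows; the train_pairs iterable, the prompt text reuse, and the hyperparameter values are placeholders and assumptions, not a prescribed recipe:

```python
# Hypothetical training-loop sketch around the Adafactor + cosine schedule shown above.
from transformers import AutoProcessor, Pix2StructForConditionalGeneration
from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup

model = Pix2StructForConditionalGeneration.from_pretrained("google/deplot")
processor = AutoProcessor.from_pretrained("google/deplot")
optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000)

model.train()
for image, target_table in train_pairs:  # placeholder iterable of (PIL image, linearized table string)
    inputs = processor(images=image, text="Generate underlying data table of the figure below:", return_tensors="pt")
    labels = processor.tokenizer(target_table, return_tensors="pt").input_ids
    loss = model(**inputs, labels=labels).loss
    loss.backward()
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()
```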
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_8.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1a29a977be90784055ae3cb0a8fb2a494c69a82 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_8.txt @@ -0,0 +1 @@ +To obtain DePlot, we standardize the plot-to-table task by establishing unified task formats and metrics, and train DePlot end-to-end on this task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deplot/chunk_9.txt b/chunked/content_aware_chunking/model_doc_deplot/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b2c2a1d72ab2c20eff8b2a52d629ba42fce958 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deplot/chunk_9.txt @@ -0,0 +1 @@ +DePlot can then be used off-the-shelf together with LLMs in a plug-and-play fashion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_10.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2c67b74afc2f89f02d78314439ae5f5b05e82ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_10.txt @@ -0,0 +1 @@ +It demonstrates impressive generalization ability. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_11.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e595d3b9b5b060e0372661da05597fd13b9592 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_11.txt @@ -0,0 +1 @@ +Further, through fine-tuning it with metric depth information from NYUv2 and KITTI, new SOTAs are set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_12.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..686dcd9117405fac815231e0a97e1140360c0d50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_12.txt @@ -0,0 +1 @@ +Our better depth model also results in a better depth-conditioned ControlNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_13.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05c40c972d3907fd4f8951fc15845658ef3ff903 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_13.txt @@ -0,0 +1 @@ +Depth Anything overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_14.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_14.txt @@ -0,0 +1 @@ +Taken from the original paper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_15.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_16.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_17.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..69687a82fb7d3ff2d0a0c7419e10d63c56e7f954 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_17.txt @@ -0,0 +1,2 @@ +Usage example +There are 2 main ways to use Depth Anything: either using the pipeline API, which abstracts away all the complexity for you, or by using the DepthAnythingForDepthEstimation class yourself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_18.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbb3bee4b39a9b8240903b0eca613a1cb6c72f16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_18.txt @@ -0,0 +1,47 @@ +Pipeline API +The pipeline allows you to use the model in a few lines of code: +python + +from transformers import pipeline +from PIL import Image +import requests +# load pipe +pipe = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-small-hf") +# load image +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +# inference +depth = pipe(image)["depth"] + +Using the model yourself +If you want to do the pre- and postprocessing yourself, here's how to do that: +python + +from transformers import AutoImageProcessor, AutoModelForDepthEstimation +import torch +import numpy as np +from PIL import Image +import requests +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +image_processor = AutoImageProcessor.from_pretrained("LiheYoung/depth-anything-small-hf") +model = AutoModelForDepthEstimation.from_pretrained("LiheYoung/depth-anything-small-hf") +# prepare image for the model +inputs = image_processor(images=image, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) + predicted_depth = outputs.predicted_depth +# interpolate to original size +prediction = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(1), + size=image.size[::-1], + mode="bicubic", + align_corners=False, + ) +# visualize the prediction +output = prediction.squeeze().cpu().numpy() +formatted = (output * 255 / np.max(output)).astype("uint8") +depth = Image.fromarray(formatted) + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_19.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..393c9085bbb7a0ae09bdb7cd9f459dd21ad9834b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_19.txt @@ -0,0 +1,2 @@ +Monocular depth estimation task guide +A notebook showcasing inference with [DepthAnythingForDepthEstimation] can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_20.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b31de82274061c061863f7d69a83c223187d5418 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_20.txt @@ -0,0 +1,3 @@ +🌎 + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_21.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_22.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ebc119a6a1b8718c9fdb55d1ef86164855dfc21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_22.txt @@ -0,0 +1,5 @@ +DepthAnythingConfig +[[autodoc]] DepthAnythingConfig +DepthAnythingForDepthEstimation +[[autodoc]] DepthAnythingForDepthEstimation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_7.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a3943a6a3ba8af2cfde27f42714109d134c9def --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_7.txt @@ -0,0 +1 @@ +It compels the model to actively seek extra visual knowledge and acquire robust representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_8.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cccff5003480b9b60c49d467b90955e28e85d0c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_8.txt @@ -0,0 +1 @@ +Second, an auxiliary supervision is developed to enforce the model to inherit rich semantic priors from pre-trained encoders. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_depth_anything/chunk_9.txt b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..97346f97e456af8c38d3d7477b3821d5727c3f64 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_depth_anything/chunk_9.txt @@ -0,0 +1 @@ +We evaluate its zero-shot capabilities extensively, including six public datasets and randomly captured photos. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_10.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6aecf1bccf60b6b7094febcf5d57bec314cc29c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_10.txt @@ -0,0 +1 @@ +Furthermore, we attribute the success of detection transformers to their expressive transformer architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_11.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b9add59aa3dcc5c0eab5ab64e1a62497d506f38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_11.txt @@ -0,0 +1 @@ +DETA overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_12.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_13.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_14.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_15.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0cd6feee18719e81e17e14a345f2d01908cac1e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_16.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..334e4385586388872c9e708bdb1d56db86c67441 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_16.txt @@ -0,0 +1 @@ +Demo notebooks for DETA can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_17.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..648cbfec7e58238ec9c7eea0f74605f31b779585 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Object detection task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_18.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_19.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..98eb43007f95889f8de50008849919fdaec05997 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_19.txt @@ -0,0 +1,12 @@ +DetaConfig +[[autodoc]] DetaConfig +DetaImageProcessor +[[autodoc]] DetaImageProcessor + - preprocess + - post_process_object_detection +DetaModel +[[autodoc]] DetaModel + - forward +DetaForObjectDetection +[[autodoc]] DetaForObjectDetection + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_6.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc38ba9a86797555daceb787bee5961736194184 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_6.txt @@ -0,0 +1 @@ +In this work, we conduct a strict comparison between the one-to-one Hungarian matching in DETRs and the one-to-many label assignments in traditional detectors with non-maximum supervision (NMS). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_7.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..826dd3c162fe08755a3e99c5904836855937498e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_7.txt @@ -0,0 +1 @@ +Surprisingly, we observe one-to-many assignments with NMS consistently outperform standard one-to-one matching under the same setting, with a significant gain of up to 2.5 mAP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_8.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd10d712951235d20af63751e7252392c0bdb99b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_8.txt @@ -0,0 +1 @@ +Our detector that trains Deformable-DETR with traditional IoU-based label assignment achieved 50.2 COCO mAP within 12 epochs (1x schedule) with ResNet50 backbone, outperforming all existing traditional or transformer-based detectors in this setting. 
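A hedged inference sketch for the DETA classes listed above; the jozhang97/deta-resnet-50 checkpoint name and the 0.5 threshold are assumptions for illustration, not taken from this text:

```python
# Illustrative sketch: object detection with DetaImageProcessor + DetaForObjectDetection.
from transformers import DetaImageProcessor, DetaForObjectDetection
from PIL import Image
import requests
import torch

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DetaImageProcessor.from_pretrained("jozhang97/deta-resnet-50")
model = DetaForObjectDetection.from_pretrained("jozhang97/deta-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# convert raw outputs to (score, label, box) triples at the original image size
results = processor.post_process_object_detection(
    outputs, threshold=0.5, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```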
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_deta/chunk_9.txt b/chunked/content_aware_chunking/model_doc_deta/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..37df471d6a5090746a6c583fd5856533f8c040d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_deta/chunk_9.txt @@ -0,0 +1 @@ +On multiple datasets, schedules, and architectures, we consistently show bipartite matching is unnecessary for performant detection transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_25.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..052fb7857e134490eaef5fd0b702974a10bc8058 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_25.txt @@ -0,0 +1,2 @@ +These input embeddings are learnt positional encodings that the authors refer to as object queries, and similarly to +the encoder, they are added to the input of each attention layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_26.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb94c8384d58cc914fc6fdfcfedfcd58d7a3c0c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_26.txt @@ -0,0 +1,2 @@ +Each object query will look for a particular object +in the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_27.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fdb49f49e4b9f0fe778f1420fbf7a05cbeefbd2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_27.txt @@ -0,0 +1,2 @@ +The decoder updates these embeddings through multiple self-attention and encoder-decoder attention layers +to output decoder_hidden_states of the same shape: (batch_size, num_queries, d_model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_28.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..625440e4708e1064a6de1ae1ab1bbef872067d27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_28.txt @@ -0,0 +1,3 @@ +Next, two heads +are added on top for object detection: a linear layer for classifying each object query into one of the objects or "no +object", and a MLP to predict bounding boxes for each query. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_29.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2b64d5ea13ad250b83c4c19ab80306b7e2ab165 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_29.txt @@ -0,0 +1,4 @@ +The model is trained using a bipartite matching loss: so what we actually do is compare the predicted classes + +bounding boxes of each of the N = 100 object queries to the ground truth annotations, padded up to the same length N +(so if an image only contains 4 objects, 96 annotations will just have a "no object" as class and "no bounding box" as +bounding box). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_30.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7831fa942909884f27e624b1b63dc7b7d8e12f48 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_30.txt @@ -0,0 +1,2 @@ +The Hungarian matching algorithm is used to find +an optimal one-to-one mapping of each of the N queries to each of the N annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_31.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3a97f1f09064ac82594195127620cb99979020b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_31.txt @@ -0,0 +1,3 @@ +Next, standard cross-entropy (for +the classes) and a linear combination of the L1 and generalized IoU loss (for the +bounding boxes) are used to optimize the parameters of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_32.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..86c1114e8f42bebb627c2c335ac5a58152a9f3cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_32.txt @@ -0,0 +1,2 @@ +DETR can be naturally extended to perform panoptic segmentation (which unifies semantic segmentation and instance +segmentation). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_33.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5daa9786ac54d08b93cb7f29fde95b23f7c2e42 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_33.txt @@ -0,0 +1,2 @@ +[~transformers.DetrForSegmentation] adds a segmentation mask head on top of +[~transformers.DetrForObjectDetection]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_34.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..23c86ce584c0dc600707587be76b7fb5ed849d31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_34.txt @@ -0,0 +1,4 @@ +The mask head can be trained either jointly, or in a two steps process, +where one first trains a [~transformers.DetrForObjectDetection] model to detect bounding boxes around both +"things" (instances) and "stuff" (background things like trees, roads, sky), then freeze all the weights and train only +the mask head for 25 epochs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_35.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7d7178b00dcb1384b005c96c650f46564084afa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_35.txt @@ -0,0 +1 @@ +Experimentally, these two approaches give similar results. 
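To make the matching step concrete, here is a toy illustration (not DETR's actual implementation) of the Hungarian assignment between N predictions and N padded annotations, using a class cost and an L1 box cost; the generalized IoU term is omitted for brevity and the cost weights are only rough approximations:

```python
# Toy sketch of the bipartite (Hungarian) matching described above; not library code.
import numpy as np
from scipy.optimize import linear_sum_assignment

num_queries, num_classes = 5, 3   # real DETR uses N = 100 queries
no_object = num_classes          # extra "no object" class index

rng = np.random.default_rng(0)
pred_logits = rng.normal(size=(num_queries, num_classes + 1))
pred_boxes = rng.uniform(size=(num_queries, 4))   # (cx, cy, w, h), normalized

# two real objects, padded with "no object" up to num_queries annotations,
# as described in the text above (the real implementation only applies the
# box cost to real objects; the zero boxes here are just for illustration)
tgt_classes = np.array([0, 2] + [no_object] * (num_queries - 2))
tgt_boxes = np.vstack([rng.uniform(size=(2, 4)), np.zeros((num_queries - 2, 4))])

probs = np.exp(pred_logits) / np.exp(pred_logits).sum(-1, keepdims=True)
class_cost = -probs[:, tgt_classes]                         # high target-class probability => low cost
l1_cost = np.abs(pred_boxes[:, None, :] - tgt_boxes[None, :, :]).sum(-1)
cost = class_cost + 5.0 * l1_cost                           # 5.0 roughly mirrors DETR's box-cost weight

rows, cols = linear_sum_assignment(cost)                    # optimal one-to-one assignment
print(list(zip(rows.tolist(), cols.tolist())))
```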
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_36.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcf50747d37def23b649a06e071451bf73dff897 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_36.txt @@ -0,0 +1,2 @@ +Note that predicting boxes is +required for the training to be possible, since the Hungarian matching is computed using distances between boxes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_37.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ce4263127c06ac893ee9a41b59dd70e3e8e5d8f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_37.txt @@ -0,0 +1,3 @@ +Usage tips + +DETR uses so-called object queries to detect objects in an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_38.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9af8336e55628efdac237e43430bd2142f1daf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_38.txt @@ -0,0 +1,3 @@ +The number of queries determines the maximum + number of objects that can be detected in a single image, and is set to 100 by default (see parameter + num_queries of [~transformers.DetrConfig]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_39.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a33578fd153dd2ae32271fe2cb46923e03103de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_39.txt @@ -0,0 +1,2 @@ +Note that it's good to have some slack (in COCO, the + authors used 100, while the maximum number of objects in a COCO image is ~70). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_40.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..31fcffe62c4e68bd644b49a2763fa39b133097eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_40.txt @@ -0,0 +1 @@ +The decoder of DETR updates the query embeddings in parallel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_41.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..d101c6b3e863c5b353d3ffc9c155e3a37bdf7efc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_41.txt @@ -0,0 +1,2 @@ +This is different from language models like GPT-2, + which use autoregressive decoding instead of parallel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_42.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd72e602edf59400686cfac3fbbf8bd01e2dca88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_42.txt @@ -0,0 +1 @@ +Hence, no causal attention mask is used. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_43.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..161444a0117f97ffe13ca71d372706f8a62a8e79 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_43.txt @@ -0,0 +1,2 @@ +DETR adds position embeddings to the hidden states at each self-attention and cross-attention layer before projecting + to queries and keys. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_44.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..c74560ddd3d410172e6cbca643305f85f5bc70b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_44.txt @@ -0,0 +1,2 @@ +For the position embeddings of the image, one can choose between fixed sinusoidal or learned + absolute position embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_45.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..27f3493a4ea3c67f35de7b17b8cc7d6d3ea3a020 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_45.txt @@ -0,0 +1,2 @@ +By default, the parameter position_embedding_type of + [~transformers.DetrConfig] is set to "sine". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_46.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aaf2f9e3807329a7245f00c9c8c35a1bd377f2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_46.txt @@ -0,0 +1,2 @@ +During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help + the model output the correct number of objects of each class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_47.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e19ec36d685a630e312b57f1d6e0ce4dd35f453 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_47.txt @@ -0,0 +1,3 @@ +If you set the parameter auxiliary_loss of + [~transformers.DetrConfig] to True, then prediction feedforward neural networks and Hungarian losses + are added after each decoder layer (with the FFNs sharing parameters). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_48.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..18f822278c1152d37f7198a40ac0c0b6d1514dc6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_48.txt @@ -0,0 +1,2 @@ +If you want to train the model in a distributed environment across multiple nodes, then one should update the + num_boxes variable in the DetrLoss class of modeling_detr.py. 
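The configuration knobs mentioned in these tips can be set together on [~transformers.DetrConfig]; the values below are illustrative, not recommendations:

```python
# Illustrative configuration tying together the tips above.
from transformers import DetrConfig, DetrForObjectDetection

config = DetrConfig(
    num_queries=100,                  # maximum number of detectable objects per image
    position_embedding_type="sine",   # fixed sinusoidal (default) vs. "learned"
    auxiliary_loss=True,              # prediction FFNs + Hungarian losses after each decoder layer
)
model = DetrForObjectDetection(config)
```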
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_49.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..96f323bcc84167aef66065b5e6836fa61cb23fad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_49.txt @@ -0,0 +1,2 @@ +When training on multiple nodes, this should be + set to the average number of target boxes across all nodes, as can be seen in the original implementation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_50.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..e41a897144706c3d7e02c986c79f826efa2b8fe8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_50.txt @@ -0,0 +1,2 @@ +[~transformers.DetrForObjectDetection] and [~transformers.DetrForSegmentation] can be initialized with + any convolutional backbone available in the timm library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_51.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..cebc72f9a71347a70a74f18845f014842c4e2e9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_51.txt @@ -0,0 +1,3 @@ +Initializing with a MobileNet backbone for example can be done by setting the backbone attribute of + [~transformers.DetrConfig] to "tf_mobilenetv3_small_075", and then initializing the model with that + config. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_52.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..26300f0f112f115dc302a259449b2a330cd4c22c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_52.txt @@ -0,0 +1,2 @@ +DETR resizes the input images such that the shortest side is at least a certain amount of pixels while the longest is + at most 1333 pixels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_53.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d59b3e178c86be1c4c50ce696d1511da28f383b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_53.txt @@ -0,0 +1,2 @@ +At training time, scale augmentation is used such that the shortest side is randomly set to at + least 480 and at most 800 pixels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_54.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb24e6dc49860c44a7e56a3a918ad667ce439320 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_54.txt @@ -0,0 +1 @@ +At inference time, the shortest side is set to 800. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_55.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e4cd8f008a5aa7bce23ec3e1a32c94fe62ce4ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_55.txt @@ -0,0 +1,3 @@ +One can use + [~transformers.DetrImageProcessor] to prepare images (and optional annotations in COCO format) for the + model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_56.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e2f587218dcb3b116ff0f4852761bae3f47b320 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_56.txt @@ -0,0 +1 @@ +Due to this resizing, images in a batch can have different sizes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_57.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0b17dd8503630966b1c919f06edd75d721f42f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_57.txt @@ -0,0 +1,2 @@ +DETR solves this by padding images up to the + largest size in a batch, and by creating a pixel mask that indicates which pixels are real/which are padding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_58.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2381f97afc5b67021c24c61fc53ccb021bce8b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_58.txt @@ -0,0 +1,2 @@ +Alternatively, one can also define a custom collate_fn in order to batch images together, using + [~transformers.DetrImageProcessor.pad_and_create_pixel_mask]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_59.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..52ddb7de30fc408c96b0ec3f7c3f52789ce44620 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_59.txt @@ -0,0 +1 @@ +The size of the images will determine the amount of memory being used, and will thus determine the batch_size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_60.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..10984cd2044b89553db92ffdbc0444d7fa46aa39 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_60.txt @@ -0,0 +1 @@ +It is advised to use a batch size of 2 per GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_61.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..1294a63c76b7592ba3349ee73049beadc01cc1c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_61.txt @@ -0,0 +1 @@ +See this Github thread for more info. 
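A minimal sketch of such a custom collate_fn, assuming each dataset item already holds an unpadded pixel_values tensor and a COCO-format labels dict (the dataset key names are assumptions):

```python
# Hypothetical collate_fn that pads a batch of differently sized images and builds the pixel mask.
from transformers import DetrImageProcessor

image_processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")

def collate_fn(batch):
    pixel_values = [item["pixel_values"] for item in batch]   # unpadded images of varying H x W
    encoding = image_processor.pad_and_create_pixel_mask(pixel_values, return_tensors="pt")
    return {
        "pixel_values": encoding["pixel_values"],   # padded up to the largest image in the batch
        "pixel_mask": encoding["pixel_mask"],       # 1 = real pixel, 0 = padding
        "labels": [item["labels"] for item in batch],
    }
```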
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_62.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..831a9bf56883789fdd23c1d7f2f8fb2cc61f7192 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_62.txt @@ -0,0 +1,19 @@ +There are three ways to instantiate a DETR model (depending on what you prefer): +Option 1: Instantiate DETR with pre-trained weights for entire model + +from transformers import DetrForObjectDetection +model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50") + +Option 2: Instantiate DETR with randomly initialized weights for Transformer, but pre-trained weights for backbone + +from transformers import DetrConfig, DetrForObjectDetection +config = DetrConfig() +model = DetrForObjectDetection(config) +Option 3: Instantiate DETR with randomly initialized weights for backbone + Transformer + +config = DetrConfig(use_pretrained_backbone=False) +model = DetrForObjectDetection(config) + +As a summary, consider the following table: +| Task | Object detection | Instance segmentation | Panoptic segmentation | +|------|------------------|-----------------------|-----------------------| +| Description | Predicting bounding boxes and class labels around objects in an image | Predicting masks around objects (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_63.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e7e6eca9e2cc9537fae0ea82bfa94fbe8fc34ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_63.txt @@ -0,0 +1 @@ +instances) in an image | Predicting masks around both objects (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_64.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b82f20518772d807c2f6129d38d21a35f4f0ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_64.txt @@ -0,0 +1 @@ +instances) as well as "stuff" (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_65.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a7647f8a8c3c6d268b5b5bdeb04b21269a43074 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_65.txt @@ -0,0 +1,5 @@ +background things like trees and roads) in an image | +| Model | [~transformers.DetrForObjectDetection] | [~transformers.DetrForSegmentation] | [~transformers.DetrForSegmentation] | +| Example dataset | COCO detection | COCO detection, COCO panoptic | COCO panoptic | +| Format of annotations to provide to [~transformers.DetrImageProcessor] | {'image_id': int, 'annotations': List[Dict]} each Dict being a COCO object annotation | {'image_id': int, 'annotations': List[Dict]} (in case of COCO detection) or {'file_name': str, 'image_id': int, 'segments_info': List[Dict]} (in case of COCO panoptic) | {'file_name': str, 'image_id': int, 'segments_info': List[Dict]} and masks_path (path to directory containing PNG files of the masks) | +| Postprocessing (i.e.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_66.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae0d4e9d7604afb2981421c83ed492ef56124118 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_66.txt @@ -0,0 +1,5 @@ +converting the output of the model to Pascal VOC format) | [~transformers.DetrImageProcessor.post_process] | [~transformers.DetrImageProcessor.post_process_segmentation] | [~transformers.DetrImageProcessor.post_process_segmentation], [~transformers.DetrImageProcessor.post_process_panoptic] | +| evaluators | CocoEvaluator with iou_types="bbox" | CocoEvaluator with iou_types="bbox" or "segm" | CocoEvaluator with iou_types="bbox" or "segm", PanopticEvaluator | +In short, one should prepare the data either in COCO detection or COCO panoptic format, then use +[~transformers.DetrImageProcessor] to create pixel_values, pixel_mask and optional +labels, which can then be used to train (or fine-tune) a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_67.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..d92273bb8f6df01c997999be2213cd87b7dd1fd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_67.txt @@ -0,0 +1,2 @@ +For evaluation, one should first convert the +outputs of the model using one of the postprocessing methods of [~transformers.DetrImageProcessor]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_68.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd333eca8a07a02009a09082a116ce47eaee7970 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_68.txt @@ -0,0 +1,3 @@ +These can +be provided to either CocoEvaluator or PanopticEvaluator, which allow you to calculate metrics like +mean Average Precision (mAP) and Panoptic Quality (PQ). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_69.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6a0491ff0a45284ae8b776872327e1e51b3ff78 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_69.txt @@ -0,0 +1 @@ +The latter objects are implemented in the original repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_70.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..389253727365a9f5561f4c98046b1ac9d618a768 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_70.txt @@ -0,0 +1 @@ +See the example notebooks for more info regarding evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_71.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..6161b529af04b9374788a44def91ccf3d7850983 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_71.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DETR.
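For completeness, a hedged sketch of the object-detection postprocessing path from the table above, using the checkpoint named earlier and [~transformers.DetrImageProcessor.post_process_object_detection]; the confidence threshold and image are only examples:

```python
# Illustrative sketch: DETR inference followed by postprocessing to Pascal VOC-style boxes.
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image
import requests
import torch

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

processor = DetrImageProcessor.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# boxes come back as (x_min, y_min, x_max, y_max) at the original image size
results = processor.post_process_object_detection(
    outputs, threshold=0.9, target_sizes=[image.size[::-1]]
)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), [round(c, 1) for c in box.tolist()])
```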
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_72.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..88d31ec0cb6b8efb74bcaaaa7fe4aa7a7476b80b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_72.txt @@ -0,0 +1 @@ +All example notebooks illustrating fine-tuning [DetrForObjectDetection] and [DetrForSegmentation] on a custom dataset an be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_73.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..648cbfec7e58238ec9c7eea0f74605f31b779585 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_73.txt @@ -0,0 +1,3 @@ +See also: Object detection task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_74.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_74.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_detr/chunk_75.txt b/chunked/content_aware_chunking/model_doc_detr/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2ce698a51a639470e830ea12992c72af55db38e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_detr/chunk_75.txt @@ -0,0 +1,29 @@ +DetrConfig +[[autodoc]] DetrConfig +DetrImageProcessor +[[autodoc]] DetrImageProcessor + - preprocess + - post_process_object_detection + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation +DetrFeatureExtractor +[[autodoc]] DetrFeatureExtractor + - call + - post_process_object_detection + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation +DETR specific outputs +[[autodoc]] models.detr.modeling_detr.DetrModelOutput +[[autodoc]] models.detr.modeling_detr.DetrObjectDetectionOutput +[[autodoc]] models.detr.modeling_detr.DetrSegmentationOutput +DetrModel +[[autodoc]] DetrModel + - forward +DetrForObjectDetection +[[autodoc]] DetrForObjectDetection + - forward +DetrForSegmentation +[[autodoc]] DetrForSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6004c3c49b9096c239eb8ae8983cd12330ce5d9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_10.txt @@ -0,0 +1,2 @@ +Training: +In order to train or fine-tune DialoGPT, one can use causal language modeling training. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b55572c73a142504b90d88eb37d32f045ed0174 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_11.txt @@ -0,0 +1,3 @@ +To cite the official paper: We +follow the OpenAI GPT-2 to model a multiturn dialogue session as a long text and frame the generation task as language +modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6200375ae2b9ff01e9d1db2ab121fbff63eada3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_12.txt @@ -0,0 +1,2 @@ +We first concatenate all dialog turns within a dialogue session into a long text x_1,, x_N (N is the +sequence length), ended by the end-of-text token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f56434af6c82de963d851f61a707be054486d75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_13.txt @@ -0,0 +1 @@ +For more information please confer to the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b820f7fa3a76d601640311ec3c9f72a9d46bb797 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_14.txt @@ -0,0 +1 @@ +DialoGPT's architecture is based on the GPT2 model, refer to GPT2's documentation page for API reference and examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_5.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..da5f9528f109198c9fcff9eb47e9067ca7b92f7e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_5.txt @@ -0,0 +1,2 @@ +The pre-trained model and training pipeline are publicly released to facilitate research into neural response +generation and the development of more intelligent open-domain dialogue systems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7ef1627958b83006a591624b502e8d3e78e55d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_7.txt @@ -0,0 +1,4 @@ +Usage tips + +DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather + than the left. 
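A minimal chat sketch following the turn-concatenation scheme described above (each turn is appended and ended with the end-of-text token); the microsoft/DialoGPT-medium checkpoint and the generation settings are assumptions based on the model card, not taken from this text:

```python
# Sketch of multi-turn generation with DialoGPT: turns are concatenated into one long
# sequence, each ended by the end-of-text token, as described above.
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

chat_history_ids = None
for step in range(3):
    user_input = input(">> User: ")
    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
    bot_input_ids = (
        torch.cat([chat_history_ids, new_input_ids], dim=-1) if chat_history_ids is not None else new_input_ids
    )
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)
    reply = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print("DialoGPT:", reply)
```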
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f36fbb13b4982bde074fae3fa3a47fcb885b9058 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_8.txt @@ -0,0 +1,2 @@ +DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful + at response generation in open-domain dialogue systems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dialogpt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..47bc0ff87d0c1a2dcc96917ce785441b14348c38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dialogpt/chunk_9.txt @@ -0,0 +1 @@ +DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on DialoGPT's model card. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_10.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0e7d3d336d3f02659af572b4a909daf012ce1fc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_10.txt @@ -0,0 +1,2 @@ +Paired with new frameworks, our large variant is the new state of the art panoptic segmentation model on COCO (58.2 PQ) +and ADE20K (48.5 PQ), and instance segmentation model on Cityscapes (44.5 AP) and ADE20K (35.4 AP) (no extra data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_11.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4af5f71632c21149325174631adc7b993e8560e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_11.txt @@ -0,0 +1,2 @@ +It also matches the state of the art specialized semantic segmentation models on ADE20K (58.2 mIoU), +and ranks second on Cityscapes (84.5 mIoU) (no extra data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_12.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..52dcf6747d9df263523611fca99ef27893dc2b6c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_12.txt @@ -0,0 +1,3 @@ +* + + Neighborhood Attention with different dilation values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_13.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_13.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_14.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..be967ace8a50cc07c821db5cea592e0657a9123a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_14.txt @@ -0,0 +1 @@ +This model was contributed by Ali Hassani. 
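Editorial note: the chunk above points to the short chatbot snippet on DialoGPT's model card. A sketch along those lines, assuming the microsoft/DialoGPT-medium checkpoint and hard-coded user turns in place of interactive input:

```python
# Sketch of multi-turn generation with DialoGPT (checkpoint name assumed: "microsoft/DialoGPT-medium").
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")

chat_history_ids = None
for user_text in ["Hello, how are you?", "Do you like movies?"]:
    # Encode the new user turn, ending it with the end-of-text token.
    new_input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors="pt")
    # Append it to the running conversation history.
    bot_input_ids = new_input_ids if chat_history_ids is None else torch.cat([chat_history_ids, new_input_ids], dim=-1)
    # Generate a response; pad_token_id is set to eos to silence the padding warning.
    chat_history_ids = model.generate(bot_input_ids, max_length=200, pad_token_id=tokenizer.eos_token_id)
    reply = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print("DialoGPT:", reply)
```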
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_15.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_16.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..854c8384b9871463148140cb7247e098674b8b91 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_16.txt @@ -0,0 +1,2 @@ +Usage tips +DiNAT can be used as a backbone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_17.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8436ad15d61ede6b4cbad9fcec731c0a9bef3219 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_17.txt @@ -0,0 +1,2 @@ +When output_hidden_states = True, +it will output both hidden_states and reshaped_hidden_states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_18.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dfa3d400638f3840ee438e483dbf0b5c9ee2a80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_18.txt @@ -0,0 +1 @@ +The reshaped_hidden_states have a shape of (batch, num_channels, height, width) rather than (batch_size, height, width, num_channels). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_19.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5312561f5159dad1a6701004a302c8a753e0136 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_19.txt @@ -0,0 +1,2 @@ +Notes: +- DiNAT depends on NATTEN's implementation of Neighborhood Attention and Dilated Neighborhood Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_20.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f8edb9bf00914975b58adca5165d5c69e46ca09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_20.txt @@ -0,0 +1 @@ +You can install it with pre-built wheels for Linux by referring to shi-labs.com/natten, or build on your system by running pip install natten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_21.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c4d10e3c11ce2138f41f46f874e53cda6493362 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_21.txt @@ -0,0 +1 @@ +Note that the latter will likely take time to compile. 
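Editorial note: the DiNAT usage tips above say that passing output_hidden_states=True returns both hidden_states and reshaped_hidden_states in different layouts. A sketch of inspecting the two, assuming NATTEN is installed and that the shi-labs/dinat-mini-in1k-224 checkpoint is available; the all-zeros image is only a placeholder:

```python
# Sketch only: requires NATTEN and assumes the "shi-labs/dinat-mini-in1k-224" checkpoint.
import numpy as np
from transformers import AutoImageProcessor, DinatModel

processor = AutoImageProcessor.from_pretrained("shi-labs/dinat-mini-in1k-224")
model = DinatModel.from_pretrained("shi-labs/dinat-mini-in1k-224")

image = np.zeros((224, 224, 3), dtype=np.uint8)  # placeholder image
inputs = processor(images=image, return_tensors="pt")
outputs = model(**inputs, output_hidden_states=True)

# hidden_states are channels-last: (batch_size, height, width, num_channels)
print(outputs.hidden_states[-1].shape)
# reshaped_hidden_states are channels-first: (batch_size, num_channels, height, width)
print(outputs.reshaped_hidden_states[-1].shape)
```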
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_22.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0e4261edf35c392458979a925aae606347d8dbc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_22.txt @@ -0,0 +1 @@ +NATTEN does not support Windows devices yet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_23.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c51930e990013fd56cc6990deb6de1de4090e99b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_23.txt @@ -0,0 +1 @@ +- Patch size of 4 is only supported at the moment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_24.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2b0575e4f6ff6fb09995eb1b5728536f2f8ac3c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_24.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DiNAT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_25.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..59b6e9012590dd9ab5155aa1b971868a2b5130d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_25.txt @@ -0,0 +1 @@ +[DinatForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_26.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_26.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_27.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_27.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_28.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..05ee1567be0b9df9d4453ea49d9f9566e8bf8360 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_28.txt @@ -0,0 +1,8 @@ +DinatConfig +[[autodoc]] DinatConfig +DinatModel +[[autodoc]] DinatModel + - forward +DinatForImageClassification +[[autodoc]] DinatForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_7.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f639a934458b9fade69b012d068ae54531c8de05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_7.txt @@ -0,0 +1,2 @@ +NA's local attention and DiNA's sparse global attention complement each other, and therefore we +introduce Dilated Neighborhood Attention Transformer (DiNAT), a new hierarchical vision transformer built upon both. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ab1c685dd3bc5cca3b9efe38577258571601328 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_8.txt @@ -0,0 +1 @@ +DiNAT variants enjoy significant improvements over strong baselines such as NAT, Swin, and ConvNeXt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinat/chunk_9.txt b/chunked/content_aware_chunking/model_doc_dinat/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2150d751a143f7cce460213ee06f1629cf9387f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinat/chunk_9.txt @@ -0,0 +1,2 @@ +Our large model is faster and ahead of its Swin counterpart by 1.5% box AP in COCO object detection, +1.3% mask AP in COCO instance segmentation, and 1.1% mIoU in ADE20K semantic segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e904c49e3f09338a888f8ff91e5735a2533f23e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_12.txt @@ -0,0 +1,2 @@ +Usage tips +The model can be traced using torch.jit.trace which leverages JIT compilation to optimize the model making it faster to run. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..92ad4859e9466115279d336652375779437fcc5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_13.txt @@ -0,0 +1 @@ +Note this still produces some mis-matched elements and the difference between the original model and the traced model is of the order of 1e-4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd2b6094cead741a50ce95c4730f9cfd3322e285 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_14.txt @@ -0,0 +1,21 @@ +thon +import torch +from transformers import AutoImageProcessor, AutoModel +from PIL import Image +import requests +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base') +model = AutoModel.from_pretrained('facebook/dinov2-base') +inputs = processor(images=image, return_tensors="pt") +outputs = model(**inputs) +last_hidden_states = outputs[0] +We have to force return_dict=False for tracing +model.config.return_dict = False +with torch.no_grad(): + traced_model = torch.jit.trace(model, [inputs.pixel_values]) + traced_outputs = traced_model(inputs.pixel_values) +print((last_hidden_states - traced_outputs[0]).abs().max()) + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c8196007ae4a41d183b0d0793bd357a23698d85 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_15.txt @@ -0,0 +1 @@ +Demo notebooks for DINOv2 can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ac9ae7fe2b7eced983b93bd89f4155d02f17d52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_16.txt @@ -0,0 +1,3 @@ +🌎 + +[Dinov2ForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9448621fe7af88c2070513ae2f75e91774a22006 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_19.txt @@ -0,0 +1,8 @@ +Dinov2Config +[[autodoc]] Dinov2Config +Dinov2Model +[[autodoc]] Dinov2Model + - forward +Dinov2ForImageClassification +[[autodoc]] Dinov2ForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd04bd776522cf838bd6173df918511397f1f79 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_8.txt @@ -0,0 +1 @@ +In terms of data, we propose an automatic pipeline to build a dedicated, diverse, and curated image dataset instead of uncurated data, as typically done in the self-supervised literature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dinov2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_dinov2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..99240268ceb92cbe548c8c0d49e94ad1cdd79916 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dinov2/chunk_9.txt @@ -0,0 +1 @@ +In terms of models, we train a ViT model (Dosovitskiy et al., 2020) with 1B parameters and distill it into a series of smaller models that surpass the best available all-purpose features, OpenCLIP (Ilharco et al., 2021) on most of the benchmarks at image and pixel levels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..14127f97f6880ac30fa138fc8d9b94cf1a6014dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_12.txt @@ -0,0 +1,2 @@ +Just + separate your segments with the separation token tokenizer.sep_token (or [SEP]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..61aca44748cf16f1ffa7a06e8d0b9b93e8602f92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_13.txt @@ -0,0 +1 @@ +DistilBERT doesn't have options to select the input positions (position_ids input). 
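Editorial note: the DistilBERT tip above says to separate segments with tokenizer.sep_token (or [SEP]), since the model has no token type ids or position_ids inputs. A minimal sketch, assuming the distilbert/distilbert-base-uncased checkpoint (the same one used later in this diff):

```python
# Sketch assuming the "distilbert/distilbert-base-uncased" checkpoint.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# DistilBERT has no token type ids; just separate segments with the [SEP] token.
# Passing the two texts as a pair lets the tokenizer insert tokenizer.sep_token for you:
encoded = tokenizer("How old are you?", "I'm six years old.", return_tensors="pt")
print(tokenizer.decode(encoded["input_ids"][0]))
# -> [CLS] how old are you? [SEP] i'm six years old. [SEP]
```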
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ffc204dade7309c71aa5e02f361a7bcf576e372 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_14.txt @@ -0,0 +1,2 @@ +This could be added if + necessary though, just let us know if you need this option. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f36b86cb4c4d6e614b475dcccc9ba3503237261e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_15.txt @@ -0,0 +1 @@ +Same as BERT but smaller. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2f4035413704c85017a9c766ca67f7de99aa95b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_16.txt @@ -0,0 +1 @@ +Trained by distillation of the pretrained BERT model, meaning it’s been trained to predict the same probabilities as the larger model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d865a60b3471eb6c39d1fabb8292bb6cc01d893 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_17.txt @@ -0,0 +1,8 @@ +The actual objective is a combination of: + +finding the same probabilities as the teacher model +predicting the masked tokens correctly (but no next-sentence objective) +a cosine similarity between the hidden states of the student and the teacher model + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DistilBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_18.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
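Editorial note: the DistilBERT chunk above lists the three parts of the distillation objective (matching the teacher's probabilities, predicting masked tokens, and a cosine loss between hidden states). A schematic sketch of how such a combined loss could be written; the temperature and the alpha/beta/gamma weights are illustrative assumptions, not the exact training recipe:

```python
# Schematic three-part distillation loss (weights and temperature are illustrative assumptions).
import torch
import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, mlm_labels, student_hidden, teacher_hidden,
                      T=2.0, alpha=5.0, beta=2.0, gamma=1.0):
    # 1) Match the teacher's output distribution (soft targets, KL divergence at temperature T).
    loss_ce = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T ** 2)
    # 2) Predict the masked tokens correctly (standard MLM cross-entropy, no next-sentence objective).
    loss_mlm = F.cross_entropy(
        student_logits.view(-1, student_logits.size(-1)), mlm_labels.view(-1), ignore_index=-100
    )
    # 3) Align student and teacher hidden states with a cosine-embedding loss.
    target = torch.ones(student_hidden.size(0) * student_hidden.size(1), device=student_hidden.device)
    loss_cos = F.cosine_embedding_loss(
        student_hidden.view(-1, student_hidden.size(-1)),
        teacher_hidden.view(-1, teacher_hidden.size(-1)),
        target,
    )
    return alpha * loss_ce + beta * loss_mlm + gamma * loss_cos
```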
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c739e0c868cce47d82d8353d5e2c156fe53aaa3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_20.txt @@ -0,0 +1 @@ +A blog post on Getting Started with Sentiment Analysis using Python with DistilBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_21.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..38d9c69e16aca7cc3377efde3845599c45747287 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_21.txt @@ -0,0 +1 @@ +A blog post on how to train DistilBERT with Blurr for sequence classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_22.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a032a7fc955912f4ebd8ff2738bce9d27f7f7c9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_22.txt @@ -0,0 +1 @@ +A blog post on how to use Ray to tune DistilBERT hyperparameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_23.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd58369397eef41aa3932831de44f871ea075f5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_23.txt @@ -0,0 +1 @@ +A blog post on how to train DistilBERT with Hugging Face and Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_24.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..53dc4b99aea6ab4c323efef523a87e9011f0bc7b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_24.txt @@ -0,0 +1 @@ +A notebook on how to finetune DistilBERT for multi-label classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_25.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c9ff90d1ac7e627c74b649e3dbe930f78eb8d07 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_25.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune DistilBERT for multiclass classification with PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_26.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f8a0d63260a137eb82ea62192ab1b77d6e48bd5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_26.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune DistilBERT for text classification in TensorFlow. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_27.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc79c146bbeb5455bd5fcbdb474ae321e330e491 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_27.txt @@ -0,0 +1,2 @@ +🌎 +[DistilBertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_28.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f31d970fa66de1627ac3eb86d6bbeb606ad50f4d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_28.txt @@ -0,0 +1 @@ +[TFDistilBertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_29.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b62f5959a4c426cb06eaa1bbbdb6a76b20ebc40e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_29.txt @@ -0,0 +1 @@ +[FlaxDistilBertForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_30.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2d628319e2202755c9ce04c0cd06fb2d11613ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_30.txt @@ -0,0 +1,3 @@ +Text classification task guide + +[DistilBertForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_31.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..23a5b6abf46e0426f95d9ccaafd6b856a20dd9ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_31.txt @@ -0,0 +1 @@ +[TFDistilBertForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_32.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..707bc88948cf3b2b1dc54372222ac08cd28827c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_32.txt @@ -0,0 +1 @@ +[FlaxDistilBertForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_33.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_33.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_34.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c0b6ca605019d1195289f9f440c819660a2f191 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_34.txt @@ -0,0 +1,3 @@ +Token classification task guide + +[DistilBertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_35.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..be0c60e428b796ff0c8848a5785558b89e1bea01 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_35.txt @@ -0,0 +1 @@ +[TFDistilBertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_36.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..323faadf06ad4c2d83d55dd3ebc9df429f35210c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_36.txt @@ -0,0 +1 @@ +[FlaxDistilBertForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_37.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_37.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_38.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..3da759223cf5aa9297e2ee52c2878b2e66e7d0c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_38.txt @@ -0,0 +1,3 @@ +Masked language modeling task guide + +[DistilBertForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_39.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4f51fac6681bd403a0dd8777e6050f7d6e9a428 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_39.txt @@ -0,0 +1 @@ +[TFDistilBertForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_40.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2b5362594758c754c1600598efe6413966528e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_40.txt @@ -0,0 +1 @@ +[FlaxDistilBertForQuestionAnswering] is supported by this example script. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_41.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_41.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_42.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..0664dcc802f91ea2e7906cf61c3feca2d6c81331 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_42.txt @@ -0,0 +1,4 @@ +Question answering task guide + +Multiple choice +- [DistilBertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_43.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9ad34c1c03012290f533a5be705338ea676467f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_43.txt @@ -0,0 +1 @@ +- [TFDistilBertForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_44.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c3161b258e71bed88172f7e7a77064841b51f76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_44.txt @@ -0,0 +1,4 @@ +- Multiple choice task guide +âš—ï¸ Optimization + +A blog post on how to quantize DistilBERT with 🤗 Optimum and Intel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_45.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dee8cd806848fa08ea6940a27db82ae5b52c023 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_45.txt @@ -0,0 +1 @@ +A blog post on how Optimizing Transformers for GPUs with 🤗 Optimum. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_46.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a97b3754e1818382240618be2cfa02f8c8a5da1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_46.txt @@ -0,0 +1 @@ +A blog post on Optimizing Transformers with Hugging Face Optimum. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_47.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..58c9208afa3dd0d93bb52f410b8878284483aba6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_47.txt @@ -0,0 +1,3 @@ +âš¡ï¸ Inference + +A blog post on how to Accelerate BERT inference with Hugging Face Transformers and AWS Inferentia with DistilBERT. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_48.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bb133698507dad1efce8f86dd3e93e71cbc065d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_48.txt @@ -0,0 +1 @@ +A blog post on Serverless Inference with Hugging Face's Transformers, DistilBERT and Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_49.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b29270d35af420622be6fc1479bb8175262e7bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_49.txt @@ -0,0 +1,3 @@ +🚀 Deploy + +A blog post on how to deploy DistilBERT on Google Cloud. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_50.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2b9e49b31026e2c30404430a277cbc827373e88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_50.txt @@ -0,0 +1 @@ +A blog post on how to deploy DistilBERT with Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_51.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc8ff4d21b3055c5e2fc257c226f96bfe8787c9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_51.txt @@ -0,0 +1 @@ +A blog post on how to Deploy BERT with Hugging Face Transformers, Amazon SageMaker and Terraform module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_52.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ef35179a15b2535f1da386172b644b81a06cf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_52.txt @@ -0,0 +1,2 @@ +Combining DistilBERT and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_53.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_53.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Make also sure that you have a hardware that is compatible with Flash-Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_54.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_54.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of flash-attn repository. 
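Editorial note: the chunk above asks you to verify that your hardware is compatible with Flash Attention 2 before installing flash-attn. A quick, illustrative check, assuming an NVIDIA GPU; FlashAttention-2 generally targets Ampere (compute capability 8.0) or newer, but the flash-attn documentation is the authoritative reference:

```python
# Illustrative check only; see the flash-attn documentation for the authoritative requirements.
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print(f"Compute capability {major}.{minor}:",
          "likely compatible with Flash Attention 2" if major >= 8 else "likely not compatible")
else:
    print("No CUDA device found; Flash Attention 2 needs a CUDA GPU.")
```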
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_55.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_55.txt @@ -0,0 +1 @@ +Make also sure to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_56.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..93bfbf9fae086dd7cdf7820a0372dd09acabb711 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_56.txt @@ -0,0 +1,10 @@ +torch.float16) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import AutoTokenizer, AutoModel +device = "cuda" # the device to load the model onto +tokenizer = AutoTokenizer.from_pretrained('distilbert/distilbert-base-uncased') +model = AutoModel.from_pretrained("distilbert/distilbert-base-uncased", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +text = "Replace me by any text you'd like." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_distilbert/chunk_57.txt b/chunked/content_aware_chunking/model_doc_distilbert/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..887b610019dd19fcd3f730cf1b1e2ea63be11183 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_distilbert/chunk_57.txt @@ -0,0 +1,67 @@ +encoded_input = tokenizer(text, return_tensors='pt').to(device) +model.to(device) +output = model(**encoded_input) + +DistilBertConfig +[[autodoc]] DistilBertConfig +DistilBertTokenizer +[[autodoc]] DistilBertTokenizer +DistilBertTokenizerFast +[[autodoc]] DistilBertTokenizerFast + +DistilBertModel +[[autodoc]] DistilBertModel + - forward +DistilBertForMaskedLM +[[autodoc]] DistilBertForMaskedLM + - forward +DistilBertForSequenceClassification +[[autodoc]] DistilBertForSequenceClassification + - forward +DistilBertForMultipleChoice +[[autodoc]] DistilBertForMultipleChoice + - forward +DistilBertForTokenClassification +[[autodoc]] DistilBertForTokenClassification + - forward +DistilBertForQuestionAnswering +[[autodoc]] DistilBertForQuestionAnswering + - forward + +TFDistilBertModel +[[autodoc]] TFDistilBertModel + - call +TFDistilBertForMaskedLM +[[autodoc]] TFDistilBertForMaskedLM + - call +TFDistilBertForSequenceClassification +[[autodoc]] TFDistilBertForSequenceClassification + - call +TFDistilBertForMultipleChoice +[[autodoc]] TFDistilBertForMultipleChoice + - call +TFDistilBertForTokenClassification +[[autodoc]] TFDistilBertForTokenClassification + - call +TFDistilBertForQuestionAnswering +[[autodoc]] TFDistilBertForQuestionAnswering + - call + +FlaxDistilBertModel +[[autodoc]] FlaxDistilBertModel + - call +FlaxDistilBertForMaskedLM +[[autodoc]] FlaxDistilBertForMaskedLM + - call +FlaxDistilBertForSequenceClassification +[[autodoc]] FlaxDistilBertForSequenceClassification + - call +FlaxDistilBertForMultipleChoice +[[autodoc]] FlaxDistilBertForMultipleChoice + - call +FlaxDistilBertForTokenClassification +[[autodoc]] FlaxDistilBertForTokenClassification + - call +FlaxDistilBertForQuestionAnswering +[[autodoc]] FlaxDistilBertForQuestionAnswering + - call \ No newline at end of file diff --git 
a/chunked/content_aware_chunking/model_doc_dit/chunk_10.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c7b1227fad2f9cd1298d0710596ec5d3b6a1e7d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_10.txt @@ -0,0 +1 @@ +document image classification (91.11 → 92.69), document layout analysis (91.0 → 94.9) and table detection (94.23 → 96.55). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..420ccc83b2d7733d9d1b88b30aff02ad0d40030c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_11.txt @@ -0,0 +1,3 @@ +* + + Summary of the approach. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f6d9c16e71bbd0c0b88b5afe771aef0ec6c3dca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_15.txt @@ -0,0 +1,7 @@ +Usage tips +One can directly use the weights of DiT with the AutoModel API: +thon +from transformers import AutoModel +model = AutoModel.from_pretrained("microsoft/dit-base") + +This will load the model pre-trained on masked image modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9421cfed917a9839705bef22314f7a0b1731476 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_16.txt @@ -0,0 +1 @@ +Note that this won't include the language modeling head on top, used to predict visual tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..64547c7c6082b5f087f13f0e39a186886190e419 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_17.txt @@ -0,0 +1,11 @@ +To include the head, you can load the weights into a BeitForMaskedImageModeling model, like so: +thon +from transformers import BeitForMaskedImageModeling +model = BeitForMaskedImageModeling.from_pretrained("microsoft/dit-base") + +You can also load a fine-tuned model from the hub, like so: +thon +from transformers import AutoModelForImageClassification +model = AutoModelForImageClassification.from_pretrained("microsoft/dit-base-finetuned-rvlcdip") + +This particular checkpoint was fine-tuned on RVL-CDIP, an important benchmark for document image classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a905f7e5fbc97e1696c738c5920790788ffdfce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_18.txt @@ -0,0 +1 @@ +A notebook that illustrates inference for document image classification can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..95b1b1b10e35f7dbd57f27f58494d7032643ba35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_19.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DiT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d77944ce7cef77c14880486e76214b4f75d5be06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_20.txt @@ -0,0 +1 @@ +[BeitForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
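Editorial note: the DiT chunk above mentions the microsoft/dit-base-finetuned-rvlcdip checkpoint fine-tuned on RVL-CDIP for document image classification. A minimal inference sketch using the image-classification pipeline; the image path is a placeholder:

```python
# Minimal inference sketch; "path/to/document.png" is a placeholder for your own document image.
from transformers import pipeline

classifier = pipeline("image-classification", model="microsoft/dit-base-finetuned-rvlcdip")
print(classifier("path/to/document.png"))
```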
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..75ea3020318c6a856b6548a8121ab31b8571f065 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_23.txt @@ -0,0 +1 @@ +As DiT's architecture is equivalent to that of BEiT, one can refer to BEiT's documentation page for all tips, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_6.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7fa26b5deabc2e951ae81aa5335cdb418eda0a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_6.txt @@ -0,0 +1 @@ +pre-training techniques. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_7.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f722a9a56836b9f009e80dc6d9b359d3d55e76d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_7.txt @@ -0,0 +1 @@ +In this paper, we propose DiT, a self-supervised pre-trained Document Image Transformer model using large-scale unlabeled text images for Document AI tasks, which is essential since no supervised counterparts ever exist due to the lack of human labeled document images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..35912520cc081d8188a5d3c298f759954b5cef4b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_8.txt @@ -0,0 +1 @@ +We leverage DiT as the backbone network in a variety of vision-based Document AI tasks, including document image classification, document layout analysis, as well as table detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_dit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..48583c44bbf88d3c034e21b40fdff11da354710f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dit/chunk_9.txt @@ -0,0 +1 @@ +Experiment results have illustrated that the self-supervised pre-trained DiT model achieves new state-of-the-art results on these downstream tasks, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_10.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0cb73c01193f6f6424ed28624744552d78d87da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_10.txt @@ -0,0 +1 @@ +Donut high-level overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_11.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_12.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_13.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_13.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_14.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1c681d9056f896c4e80177e9913567584b16911 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_14.txt @@ -0,0 +1,5 @@ +Usage tips + +The quickest way to get started with Donut is by checking the tutorial + notebooks, which show how to use the model + at inference time as well as fine-tuning on custom data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_15.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccd684ecc7caaad6dfe0787844307fd0c6c5b97a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_15.txt @@ -0,0 +1 @@ +Donut is always used within the VisionEncoderDecoder framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_16.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e488ec5b7ec18fc1005be05ad88cb4ccfaeafdd8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_16.txt @@ -0,0 +1,3 @@ +Inference examples +Donut's [VisionEncoderDecoder] model accepts images as input and makes use of +[~generation.GenerationMixin.generate] to autoregressively generate text given the input image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_17.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b4b189330a9389b7f1d39837fab7a8f5a58bfd7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_17.txt @@ -0,0 +1,2 @@ +The [DonutImageProcessor] class is responsible for preprocessing the input image and +[XLMRobertaTokenizer/XLMRobertaTokenizerFast] decodes the generated target tokens to the target string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_18.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1817daa2d09477f12e5e8b3143b636fe4852897 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_18.txt @@ -0,0 +1,3 @@ +The +[DonutProcessor] wraps [DonutImageProcessor] and [XLMRobertaTokenizer/XLMRobertaTokenizerFast] +into a single instance to both extract the input features and decode the predicted token ids. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_19.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..60def79cc627b932ab57bf26b1c92729e5649507 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_19.txt @@ -0,0 +1,30 @@ +Step-by-step Document Image Classification + +import re +from transformers import DonutProcessor, VisionEncoderDecoderModel +from datasets import load_dataset +import torch +processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") +model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-rvlcdip") +device = "cuda" if torch.cuda.is_available() else "cpu" +model.to(device) # doctest: +IGNORE_RESULT +load document image +dataset = load_dataset("hf-internal-testing/example-documents", split="test") +image = dataset[1]["image"] +prepare decoder inputs +task_prompt = "" +decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids +pixel_values = processor(image, return_tensors="pt").pixel_values +outputs = model.generate( + pixel_values.to(device), + decoder_input_ids=decoder_input_ids.to(device), + max_length=model.decoder.config.max_position_embeddings, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True, + ) +sequence = processor.batch_decode(outputs.sequences)[0] +sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +sequence = re.sub(r"<. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_20.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd47d6ce62a2be1a369e6be3cb3650a0bb58a387 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_20.txt @@ -0,0 +1,34 @@ +*?>", "", sequence, count=1).strip() # remove first task start token +print(processor.token2json(sequence)) +{'class': 'advertisement'} + +Step-by-step Document Parsing + +import re +from transformers import DonutProcessor, VisionEncoderDecoderModel +from datasets import load_dataset +import torch +processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") +model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-cord-v2") +device = "cuda" if torch.cuda.is_available() else "cpu" +model.to(device) # doctest: +IGNORE_RESULT +load document image +dataset = load_dataset("hf-internal-testing/example-documents", split="test") +image = dataset[2]["image"] +prepare decoder inputs +task_prompt = "" +decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids +pixel_values = processor(image, return_tensors="pt").pixel_values +outputs = model.generate( + pixel_values.to(device), + decoder_input_ids=decoder_input_ids.to(device), + max_length=model.decoder.config.max_position_embeddings, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True, + ) +sequence = processor.batch_decode(outputs.sequences)[0] +sequence = sequence.replace(processor.tokenizer.eos_token, 
"").replace(processor.tokenizer.pad_token, "") +sequence = re.sub(r"<. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_21.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c3f617067baa27cd3946861a6ca0ece2e2b9fea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_21.txt @@ -0,0 +1,20 @@ +*?>", "", sequence, count=1).strip() # remove first task start token +print(processor.token2json(sequence)) +{'menu': {'nm': 'CINNAMON SUGAR', 'unitprice': '17,000', 'cnt': '1 x', 'price': '17,000'}, 'sub_total': {'subtotal_price': '17,000'}, 'total': {'total_price': '17,000', 'cashprice': '20,000', 'changeprice': '3,000'}} + +Step-by-step Document Visual Question Answering (DocVQA) + +import re +from transformers import DonutProcessor, VisionEncoderDecoderModel +from datasets import load_dataset +import torch +processor = DonutProcessor.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") +model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-finetuned-docvqa") +device = "cuda" if torch.cuda.is_available() else "cpu" +model.to(device) # doctest: +IGNORE_RESULT +load document image from the DocVQA dataset +dataset = load_dataset("hf-internal-testing/example-documents", split="test") +image = dataset[0]["image"] +prepare decoder inputs +task_prompt = "{user_input}" +question = "When is the coffee break?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_22.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..0163dd1883929502075b0a976cb7601088d7793f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_22.txt @@ -0,0 +1,16 @@ +prompt = task_prompt.replace("{user_input}", question) +decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids +pixel_values = processor(image, return_tensors="pt").pixel_values +outputs = model.generate( + pixel_values.to(device), + decoder_input_ids=decoder_input_ids.to(device), + max_length=model.decoder.config.max_position_embeddings, + pad_token_id=processor.tokenizer.pad_token_id, + eos_token_id=processor.tokenizer.eos_token_id, + use_cache=True, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + return_dict_in_generate=True, + ) +sequence = processor.batch_decode(outputs.sequences)[0] +sequence = sequence.replace(processor.tokenizer.eos_token, "").replace(processor.tokenizer.pad_token, "") +sequence = re.sub(r"<. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_23.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ec0f994741d96d09d21f51c598f7bc622fe56b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_23.txt @@ -0,0 +1,3 @@ +*?>", "", sequence, count=1).strip() # remove first task start token +print(processor.token2json(sequence)) +{'question': 'When is the coffee break? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_24.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..848478147e4c98ab7bffd31b8e8492eed4be5476 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_24.txt @@ -0,0 +1,3 @@ +', 'answer': '11-14 to 11:39 a.m.'} + +See the model hub to look for Donut checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_25.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..917f1089d61bb2e7877df9c39a974f0ce0102f72 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_25.txt @@ -0,0 +1,2 @@ +Training +We refer to the tutorial notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_26.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..023de526d466738ce6f4b8014055ceab15107a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_26.txt @@ -0,0 +1,18 @@ +DonutSwinConfig +[[autodoc]] DonutSwinConfig +DonutImageProcessor +[[autodoc]] DonutImageProcessor + - preprocess +DonutFeatureExtractor +[[autodoc]] DonutFeatureExtractor + - call +DonutProcessor +[[autodoc]] DonutProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode +DonutSwinModel +[[autodoc]] DonutSwinModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_7.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9feb04a83ed293c2e9fc377fd6560cdd4621421 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_7.txt @@ -0,0 +1 @@ +Donut is conceptually simple yet effective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_8.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b648a3f99c56b180640c9568c9f71065111b22bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_8.txt @@ -0,0 +1 @@ +Through extensive experiments and analyses, we show a simple OCR-free VDU model, Donut, achieves state-of-the-art performances on various VDU tasks in terms of both speed and accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_donut/chunk_9.txt b/chunked/content_aware_chunking/model_doc_donut/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f00cff14ca0760308907923a436c71d4bacb761e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_donut/chunk_9.txt @@ -0,0 +1 @@ +In addition, we offer a synthetic data generator that helps the model pre-training to be flexible in various languages and domains. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpr/chunk_5.txt b/chunked/content_aware_chunking/model_doc_dpr/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..985d1c27d3868ad83a8f336e7d6ff79820115b9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpr/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by lhoestq. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpr/chunk_6.txt b/chunked/content_aware_chunking/model_doc_dpr/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpr/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpr/chunk_7.txt b/chunked/content_aware_chunking/model_doc_dpr/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bd867c61d116044faa4a16400c30c9282b0a36c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpr/chunk_7.txt @@ -0,0 +1,7 @@ +Usage tips + +DPR consists in three models: + +Question encoder: encode questions as vectors +Context encoder: encode contexts as vectors +Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the inferred span actually answers the question). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpr/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dpr/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c8f087ef8925aa5cf867697c7887c9d03d8c9a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpr/chunk_8.txt @@ -0,0 +1,38 @@ +DPRConfig +[[autodoc]] DPRConfig +DPRContextEncoderTokenizer +[[autodoc]] DPRContextEncoderTokenizer +DPRContextEncoderTokenizerFast +[[autodoc]] DPRContextEncoderTokenizerFast +DPRQuestionEncoderTokenizer +[[autodoc]] DPRQuestionEncoderTokenizer +DPRQuestionEncoderTokenizerFast +[[autodoc]] DPRQuestionEncoderTokenizerFast +DPRReaderTokenizer +[[autodoc]] DPRReaderTokenizer +DPRReaderTokenizerFast +[[autodoc]] DPRReaderTokenizerFast +DPR specific outputs +[[autodoc]] models.dpr.modeling_dpr.DPRContextEncoderOutput +[[autodoc]] models.dpr.modeling_dpr.DPRQuestionEncoderOutput +[[autodoc]] models.dpr.modeling_dpr.DPRReaderOutput + +DPRContextEncoder +[[autodoc]] DPRContextEncoder + - forward +DPRQuestionEncoder +[[autodoc]] DPRQuestionEncoder + - forward +DPRReader +[[autodoc]] DPRReader + - forward + +TFDPRContextEncoder +[[autodoc]] TFDPRContextEncoder + - call +TFDPRQuestionEncoder +[[autodoc]] TFDPRQuestionEncoder + - call +TFDPRReader +[[autodoc]] TFDPRReader + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c67f75d3e0fbe33984589fee894a7aa70af88a21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_10.txt @@ -0,0 +1 @@ +DPT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. 
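The DPR chunk above lists the three components (question encoder, context encoder, reader). A hedged sketch of how they fit together is given below; the facebook/dpr-*-single-nq-base checkpoint names, the toy question, and the two candidate passages are assumptions used only for illustration.

import torch
from transformers import (
    DPRContextEncoder, DPRContextEncoderTokenizer,
    DPRQuestionEncoder, DPRQuestionEncoderTokenizer,
    DPRReader, DPRReaderTokenizer,
)

# 1) encode the question as a dense vector
q_tok = DPRQuestionEncoderTokenizer.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
q_enc = DPRQuestionEncoder.from_pretrained("facebook/dpr-question_encoder-single-nq-base")
question = "Who composed the opera Tosca?"
q_emb = q_enc(**q_tok(question, return_tensors="pt")).pooler_output  # shape (1, hidden)

# 2) encode candidate contexts and rank them by dot-product similarity
ctx_tok = DPRContextEncoderTokenizer.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
ctx_enc = DPRContextEncoder.from_pretrained("facebook/dpr-ctx_encoder-single-nq-base")
contexts = ["Tosca is an opera by Giacomo Puccini.", "The Eiffel Tower is in Paris."]
ctx_emb = ctx_enc(**ctx_tok(contexts, padding=True, return_tensors="pt")).pooler_output
scores = q_emb @ ctx_emb.T  # higher score = more relevant passage
best = contexts[int(scores.argmax())]

# 3) extract an answer span plus a relevance score from the retrieved passage
r_tok = DPRReaderTokenizer.from_pretrained("facebook/dpr-reader-single-nq-base")
reader = DPRReader.from_pretrained("facebook/dpr-reader-single-nq-base")
inputs = r_tok(questions=[question], titles=["Tosca"], texts=[best], return_tensors="pt")
outputs = reader(**inputs)  # start_logits, end_logits, relevance_logits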
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4930832ccbae4d72632768e285626eaba882bab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_14.txt @@ -0,0 +1,2 @@ +Usage tips +DPT is compatible with the [AutoBackbone] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c79a71a3a360517c61370311876e584e84d4e56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_15.txt @@ -0,0 +1 @@ +This allows to use the DPT framework with various computer vision backbones available in the library, such as [VitDetBackbone] or [Dinov2Backbone]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bc2dfa9ee4203bd6a529cdd1d3334c7f6d19246 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_16.txt @@ -0,0 +1,11 @@ +One can create it as follows: +thon +from transformers import Dinov2Config, DPTConfig, DPTForDepthEstimation +initialize with a Transformer-based backbone such as DINOv2 +in that case, we also specify reshape_hidden_states=False to get feature maps of shape (batch_size, num_channels, height, width) +backbone_config = Dinov2Config.from_pretrained("facebook/dinov2-base", out_features=["stage1", "stage2", "stage3", "stage4"], reshape_hidden_states=False) +config = DPTConfig(backbone_config=backbone_config) +model = DPTForDepthEstimation(config=config) + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with DPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6be394731d113d248d2df566d57f17ff64b83f7e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_17.txt @@ -0,0 +1 @@ +Demo notebooks for [DPTForDepthEstimation] can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f65e70258a806d37625fd9e410de18b78eed23c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_18.txt @@ -0,0 +1,5 @@ +Semantic segmentation task guide + +Monocular depth estimation task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..105360b963ae33a00d5f7dbe0e6926aa98d444ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_20.txt @@ -0,0 +1,19 @@ +DPTConfig +[[autodoc]] DPTConfig +DPTFeatureExtractor +[[autodoc]] DPTFeatureExtractor + - call + - post_process_semantic_segmentation +DPTImageProcessor +[[autodoc]] DPTImageProcessor + - preprocess + - post_process_semantic_segmentation +DPTModel +[[autodoc]] DPTModel + - forward +DPTForDepthEstimation +[[autodoc]] DPTForDepthEstimation + - forward +DPTForSemanticSegmentation +[[autodoc]] DPTForSemanticSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..90172945fb4be2f082b3f5597d1522dc00948f2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_6.txt @@ -0,0 +1 @@ +Our experiments show that this architecture yields substantial improvements on dense prediction tasks, especially when a large amount of training data is available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee421055fa1a56ab120cbb4497e9ac78d0db62bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_7.txt @@ -0,0 +1 @@ +For monocular depth estimation, we observe an improvement of up to 28% in relative performance when compared to a state-of-the-art fully-convolutional network. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9a2295df608e23d9a15c51621d4965a90dce4a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_8.txt @@ -0,0 +1 @@ +When applied to semantic segmentation, dense vision transformers set a new state of the art on ADE20K with 49.02% mIoU. 
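Since the DPT chunks above only show how to build a DPT model from a backbone config, a short inference sketch for [DPTForDepthEstimation] may help; the "Intel/dpt-large" checkpoint and the local image path are assumptions for illustration.

import torch
from PIL import Image
from transformers import DPTImageProcessor, DPTForDepthEstimation

processor = DPTImageProcessor.from_pretrained("Intel/dpt-large")
model = DPTForDepthEstimation.from_pretrained("Intel/dpt-large")

image = Image.open("room.jpg")  # hypothetical input image
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    predicted_depth = model(**inputs).predicted_depth  # (batch, height, width)

# upsample the depth map back to the original image resolution
depth = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=image.size[::-1],  # PIL size is (width, height)
    mode="bicubic",
    align_corners=False,
).squeeze()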
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_dpt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_dpt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..629751da7656b2191bfdc82d672d572146c6ed6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_dpt/chunk_9.txt @@ -0,0 +1 @@ +We further show that the architecture can be fine-tuned on smaller datasets such as NYUv2, KITTI, and Pascal Context where it also sets the new state of the art. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..643b2bc2edf381464c0d115c23ff42d0ccc5c5f4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_10.txt @@ -0,0 +1 @@ +Extensive experiments show the superiority of EfficientFormer in performance and speed on mobile devices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..59f6383499dcf7a48a02e3dd701c63e66b034c44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_11.txt @@ -0,0 +1,3 @@ +Our fastest model, EfficientFormer-L1, achieves 79.2% top-1 accuracy on ImageNet-1K with only 1.6 ms inference latency on +iPhone 12 (compiled with CoreML), which { runs as fast as MobileNetV2×1.4 (1.6 ms, 74.7% top-1),} and our largest model, +EfficientFormer-L7, obtains 83.3% accuracy with only 7.0 ms latency. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1830bcd874b93fdfe20b375e883217a7685433e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_12.txt @@ -0,0 +1,2 @@ +Our work proves that properly designed transformers can +reach extremely low latency on mobile devices while maintaining high performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ead558db16eb4e2a33b1b54a5faa506ea6d995d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by novice03 and Bearnardd. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0012e995e3ed3ee3e6ff2de10fb44921451f4ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_15.txt @@ -0,0 +1 @@ +The TensorFlow version of this model was added by D-Roberts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c56edb92a299c24cbf23e996b5e9ec629541ae4b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_16.txt @@ -0,0 +1,29 @@ +Documentation resources + +Image classification task guide + +EfficientFormerConfig +[[autodoc]] EfficientFormerConfig +EfficientFormerImageProcessor +[[autodoc]] EfficientFormerImageProcessor + - preprocess + +EfficientFormerModel +[[autodoc]] EfficientFormerModel + - forward +EfficientFormerForImageClassification +[[autodoc]] EfficientFormerForImageClassification + - forward +EfficientFormerForImageClassificationWithTeacher +[[autodoc]] EfficientFormerForImageClassificationWithTeacher + - forward + +TFEfficientFormerModel +[[autodoc]] TFEfficientFormerModel + - call +TFEfficientFormerForImageClassification +[[autodoc]] TFEfficientFormerForImageClassification + - call +TFEfficientFormerForImageClassificationWithTeacher +[[autodoc]] TFEfficientFormerForImageClassificationWithTeacher + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3ad4381500e0d1a70a84e7952ad09bf692c3bcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_8.txt @@ -0,0 +1 @@ +Then we introduce a dimension-consistent pure transformer (without MobileNet blocks) as a design paradigm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f51fed96bf10b547bb20c6f47254e4371386510e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientformer/chunk_9.txt @@ -0,0 +1 @@ +Finally, we perform latency-driven slimming to get a series of final models dubbed EfficientFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..94f6afba3b0af7d2154f741e2e665bb83e3cb8ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_11.txt @@ -0,0 +1,11 @@ +EfficientNetConfig +[[autodoc]] EfficientNetConfig +EfficientNetImageProcessor +[[autodoc]] EfficientNetImageProcessor + - preprocess +EfficientNetModel +[[autodoc]] EfficientNetModel + - forward +EfficientNetForImageClassification +[[autodoc]] EfficientNetForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_5.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a8d176f595bdbbbf12f68b822009800cc03666d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_5.txt @@ -0,0 +1 @@ +We demonstrate the effectiveness of this method on scaling up MobileNets and ResNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbbb22175695874a704312271ebffaf84b8a8794 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_6.txt @@ -0,0 +1 @@ +To go even further, we use neural architecture search to design a new baseline network and scale it up to obtain a family of models, called EfficientNets, which achieve much better accuracy and efficiency than previous ConvNets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6578c894993758c06053312aa71146e5988fa330 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_7.txt @@ -0,0 +1 @@ +In particular, our EfficientNet-B7 achieves state-of-the-art 84.3% top-1 accuracy on ImageNet, while being 8.4x smaller and 6.1x faster on inference than the best existing ConvNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b82856485bdd544f8c9375d4e890e0d6130af03 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_8.txt @@ -0,0 +1 @@ +Our EfficientNets also transfer well and achieve state-of-the-art accuracy on CIFAR-100 (91.7%), Flowers (98.8%), and 3 other transfer learning datasets, with an order of magnitude fewer parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_efficientnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..773cf31a6bf61e91bd6c3fb37484f259b8192082 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_efficientnet/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by adirik. 
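To complement the EfficientNet class listing above, here is a hedged image-classification sketch; the "google/efficientnet-b7" checkpoint name and the local image path are assumptions for illustration.

import torch
from PIL import Image
from transformers import AutoImageProcessor, EfficientNetForImageClassification

processor = AutoImageProcessor.from_pretrained("google/efficientnet-b7")
model = EfficientNetForImageClassification.from_pretrained("google/efficientnet-b7")

image = Image.open("cat.jpg")  # hypothetical input image
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])  # ImageNet-1K class name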
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_10.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c85064755d53e7d3bab3bc4a621ab4c87b3af39 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_10.txt @@ -0,0 +1,2 @@ +As a result, the contextual representations learned by our +approach substantially outperform the ones learned by BERT given the same model size, data, and compute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_11.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c8c8e0f50118b4c373b7a285461055797c85998 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_11.txt @@ -0,0 +1,3 @@ +The gains are +particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained +using 30x more compute) on the GLUE natural language understanding benchmark. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_12.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f485ff9b1994418b2b82d316a772687b270e774 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_12.txt @@ -0,0 +1,3 @@ +Our approach also works well at scale, +where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when +using the same amount of compute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_13.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba0214afcc802be2c905f184f5440f08eb91b3a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by lysandre. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_14.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_15.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2d331e08784046521664beb8b345e55c6c517b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +ELECTRA is the pretraining approach, therefore there is nearly no changes done to the underlying model: BERT. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_16.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ab861e0438ae89f5975e8df097a1a29028ac1ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_16.txt @@ -0,0 +1,3 @@ +The + only change is the separation of the embedding size and the hidden size: the embedding size is generally smaller, + while the hidden size is larger. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_17.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f80a941a07819ae318062f0583eac43db126373 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_17.txt @@ -0,0 +1,2 @@ +An additional projection layer (linear) is used to project the embeddings from their + embedding size to the hidden size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_18.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee27b40955e230a3c39a661df3742afadfe88606 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_18.txt @@ -0,0 +1,2 @@ +In the case where the embedding size is the same as the hidden size, no projection + layer is used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_19.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd2b4913a422ebe4a96dca81a99421a2d1598a78 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_19.txt @@ -0,0 +1 @@ +ELECTRA is a transformer model pretrained with the use of another (small) masked language model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_20.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..feb3468d8d620adb4d1131f9dde677bb25562226 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_20.txt @@ -0,0 +1 @@ +The inputs are corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA has to predict which token is an original and which one has been replaced. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_21.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..506ea22f04073357ae7b52388cd6d9e14878a814 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_21.txt @@ -0,0 +1 @@ +Like for GAN training, the small language model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a traditional GAN setting) then the ELECTRA model is trained for a few steps. 
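The ELECTRA chunks above describe the discriminator's pretraining objective: for every input token, predict whether it is original or was replaced by the small generator. A minimal sketch of querying the discriminator follows; the "google/electra-small-discriminator" checkpoint and the manually corrupted sentence ("flew" standing in for a generator replacement) are assumptions for illustration.

import torch
from transformers import ElectraTokenizerFast, ElectraForPreTraining

tokenizer = ElectraTokenizerFast.from_pretrained("google/electra-small-discriminator")
discriminator = ElectraForPreTraining.from_pretrained("google/electra-small-discriminator")

# a sentence where one token has been swapped, as the generator would do during pretraining
sentence = "the quick brown fox flew over the lazy dog"
inputs = tokenizer(sentence, return_tensors="pt")

with torch.no_grad():
    logits = discriminator(**inputs).logits  # one replaced-vs-original score per token

predictions = (torch.sigmoid(logits) > 0.5).long().squeeze()
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids.squeeze())
print(list(zip(tokens, predictions.tolist())))  # 1 flags tokens the discriminator thinks were replaced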
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_22.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1b114566c446ca8133859222e5d4d9e2f4443f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_22.txt @@ -0,0 +1,2 @@ +The ELECTRA checkpoints saved using Google Research's implementation + contain both the generator and discriminator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_23.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ba48e1b873dc0103bdef84885d49d2c972e34b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_23.txt @@ -0,0 +1,2 @@ +The conversion script requires the user to name which model to export + into the correct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_24.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..962e8dcad54dbb19cc0bb08fd8a49cca8951ab42 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_24.txt @@ -0,0 +1,2 @@ +Once converted to the HuggingFace format, these checkpoints may be loaded into all + available ELECTRA models, however. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_25.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f646c175a0b5527654dc5180c82768ab2c9e7b6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_25.txt @@ -0,0 +1,4 @@ +This means that the discriminator may be loaded in the + [ElectraForMaskedLM] model, and the generator may be loaded in the + [ElectraForPreTraining] model (the classification head will be randomly initialized as it + doesn't exist in the generator). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_electra/chunk_26.txt b/chunked/content_aware_chunking/model_doc_electra/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..45df0f6690054736d5a0d642da66e053a002a24b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_electra/chunk_26.txt @@ -0,0 +1,90 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +ElectraConfig +[[autodoc]] ElectraConfig +ElectraTokenizer +[[autodoc]] ElectraTokenizer +ElectraTokenizerFast +[[autodoc]] ElectraTokenizerFast +Electra specific outputs +[[autodoc]] models.electra.modeling_electra.ElectraForPreTrainingOutput +[[autodoc]] models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput + +ElectraModel +[[autodoc]] ElectraModel + - forward +ElectraForPreTraining +[[autodoc]] ElectraForPreTraining + - forward +ElectraForCausalLM +[[autodoc]] ElectraForCausalLM + - forward +ElectraForMaskedLM +[[autodoc]] ElectraForMaskedLM + - forward +ElectraForSequenceClassification +[[autodoc]] ElectraForSequenceClassification + - forward +ElectraForMultipleChoice +[[autodoc]] ElectraForMultipleChoice + - forward +ElectraForTokenClassification +[[autodoc]] ElectraForTokenClassification + - forward +ElectraForQuestionAnswering +[[autodoc]] ElectraForQuestionAnswering + - forward + +TFElectraModel +[[autodoc]] TFElectraModel + - call +TFElectraForPreTraining +[[autodoc]] TFElectraForPreTraining + - call +TFElectraForMaskedLM +[[autodoc]] TFElectraForMaskedLM + - call +TFElectraForSequenceClassification +[[autodoc]] TFElectraForSequenceClassification + - call +TFElectraForMultipleChoice +[[autodoc]] TFElectraForMultipleChoice + - call +TFElectraForTokenClassification +[[autodoc]] TFElectraForTokenClassification + - call +TFElectraForQuestionAnswering +[[autodoc]] TFElectraForQuestionAnswering + - call + +FlaxElectraModel +[[autodoc]] FlaxElectraModel + - call +FlaxElectraForPreTraining +[[autodoc]] FlaxElectraForPreTraining + - call +FlaxElectraForCausalLM +[[autodoc]] FlaxElectraForCausalLM + - call +FlaxElectraForMaskedLM +[[autodoc]] FlaxElectraForMaskedLM + - call +FlaxElectraForSequenceClassification +[[autodoc]] FlaxElectraForSequenceClassification + - call +FlaxElectraForMultipleChoice +[[autodoc]] FlaxElectraForMultipleChoice + - call +FlaxElectraForTokenClassification +[[autodoc]] FlaxElectraForTokenClassification + - call +FlaxElectraForQuestionAnswering +[[autodoc]] FlaxElectraForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_10.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_11.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e72e6b3ab1bb21ea8874897d5edcd34cf8edf99 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_11.txt @@ -0,0 +1,27 @@ +Usage example +Here is a quick example of how to encode and decode an audio using this model: +thon + +from datasets import load_dataset, Audio +from transformers import EncodecModel, AutoProcessor +librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +model = EncodecModel.from_pretrained("facebook/encodec_24khz") +processor = AutoProcessor.from_pretrained("facebook/encodec_24khz") +librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate)) +audio_sample = librispeech_dummy[-1]["audio"]["array"] +inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt") +encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"]) +audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0] +or the equivalent with a forward pass +audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values + +EncodecConfig +[[autodoc]] EncodecConfig +EncodecFeatureExtractor +[[autodoc]] EncodecFeatureExtractor + - call +EncodecModel +[[autodoc]] EncodecModel + - decode + - encode + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_5.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd70c7f63c2a3a82337982ef1d776bafee51984 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_5.txt @@ -0,0 +1 @@ +Finally, we study how lightweight Transformer models can be used to further compress the obtained representation by up to 40%, while staying faster than real time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_6.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..88c0ea9b50a969cb8778f7e844d9f6c71e7dadf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_6.txt @@ -0,0 +1 @@ +We provide a detailed description of the key design choices of the proposed model including: training objective, architectural changes and a study of various perceptual loss functions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_7.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b83fb9937f9ba7c52a0e180769235118536a79c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_7.txt @@ -0,0 +1 @@ +We present an extensive subjective evaluation (MUSHRA tests) together with an ablation study for a range of bandwidths and audio domains, including speech, noisy-reverberant speech, and music. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_8.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a16318d38b3715e232adee43f271b4cce8f98917 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_8.txt @@ -0,0 +1 @@ +Our approach is superior to the baselines methods across all evaluated settings, considering both 24 kHz monophonic and 48 kHz stereophonic audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encodec/chunk_9.txt b/chunked/content_aware_chunking/model_doc_encodec/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8375bba17034490836999c484b51a793263f767f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encodec/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by Matthijs, Patrick Von Platen and Arthur Zucker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_14.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7d22e2c0189856696b62696c816fec60e2b622 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_14.txt @@ -0,0 +1 @@ +Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_15.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba6a3c212c3a8e013b5cc8d463f4c8c5f68fc217 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_15.txt @@ -0,0 +1 @@ +Initializing [EncoderDecoderModel] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in the Warm-starting-encoder-decoder blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_16.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6303b7cf89d983d5ae92069d5921ddc4bcc327ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_16.txt @@ -0,0 +1 @@ +To do so, the EncoderDecoderModel class provides a [EncoderDecoderModel.from_encoder_decoder_pretrained] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_17.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b03d806eaa28ebd782826790466f2f7e555ac963 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_17.txt @@ -0,0 +1,7 @@ +thon + +from transformers import EncoderDecoderModel, BertTokenizer +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") + +Loading an existing EncoderDecoderModel checkpoint and perform inference. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_18.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9a83996d8d7871b4c4db9508641e334fd25a67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_18.txt @@ -0,0 +1 @@ +To load fine-tuned checkpoints of the EncoderDecoderModel class, [EncoderDecoderModel] provides the from_pretrained() method just like any other model architecture in Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_19.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e55dc314fb376c66035398fd8c7b4ace794cafd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_19.txt @@ -0,0 +1 @@ +To perform inference, one uses the [generate] method, which allows to autoregressively generate text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_20.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f29d5c272c8190c62073648395190c4babe6a276 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_20.txt @@ -0,0 +1 @@ +This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_21.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cef1a969636b619c06b847f071c53c632e36bf2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_21.txt @@ -0,0 +1,10 @@ +thon + +from transformers import AutoTokenizer, EncoderDecoderModel +load a fine-tuned seq2seq model and corresponding tokenizer +model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail") +tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail") +let's perform inference on a long piece of text +ARTICLE_TO_SUMMARIZE = ( + "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + "amid dry conditions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_22.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d1a7e76bf1ad6e54de61c163b4222161db70496 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_22.txt @@ -0,0 +1 @@ +The aim is to reduce the risk of wildfires. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_23.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ccffe33e9fe8143ac46d84ab906b0bbc9c16bf8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_23.txt @@ -0,0 +1,2 @@ +Nearly 800 thousand customers were " + "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_24.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7db3fa97e71fa75b8c79526d927c66bb17d2d808 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_24.txt @@ -0,0 +1,7 @@ +) +input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids +autoregressively generate summary (uses greedy decoding by default) +generated_ids = model.generate(input_ids) +generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_text) +nearly 800 thousand customers were affected by the shutoffs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_25.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7d8a0d41d5b0fb5015e4bd9179e44038aad9b46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_25.txt @@ -0,0 +1 @@ +the aim is to reduce the risk of wildfires. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_26.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..e26630caef1063f17b5e482812de9d378849f3b7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_26.txt @@ -0,0 +1 @@ +nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_27.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..aea56941bc88e139991ea1048d2762b4cd6b0d27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_27.txt @@ -0,0 +1 @@ +pg & e said it scheduled the blackouts to last through at least midday tomorrow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_28.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d277bf77dffa23c00834f1e366ea871cf1058b09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_28.txt @@ -0,0 +1 @@ +Loading a PyTorch checkpoint into TFEncoderDecoderModel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_29.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4dff2f7af1d438ba7d0edea6f42c0c90f85da6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_29.txt @@ -0,0 +1,2 @@ +[TFEncoderDecoderModel.from_pretrained] currently doesn't support initializing the model from a +pytorch checkpoint. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_30.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4276f4671a622acf63e86454add1a9998c15313 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_30.txt @@ -0,0 +1 @@ +Passing from_pt=True to this method will throw an exception. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_31.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdc6465684a5e42af2eecb569fb56135cd11a763 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_31.txt @@ -0,0 +1,13 @@ +If there are only pytorch +checkpoints for a particular encoder-decoder model, a workaround is: +thon + +a workaround to load from pytorch checkpoint +from transformers import EncoderDecoderModel, TFEncoderDecoderModel +_model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16") +_model.encoder.save_pretrained("./encoder") +_model.decoder.save_pretrained("./decoder") +model = TFEncoderDecoderModel.from_encoder_decoder_pretrained( + "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True + ) +This is only for copying some specific attributes of this particular model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_32.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..067080d29079a0cd0961d8f839448903cfaa1fa7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_32.txt @@ -0,0 +1,4 @@ +model.config = _model.config + +Training +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_33.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..0980876e6a9dd6cd0cd0f16cf47225634309ee7b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_33.txt @@ -0,0 +1,3 @@ +As you can see, only 2 inputs are required for the model in order to compute a loss: input_ids (which are the +input_ids of the encoded input sequence) and labels (which are the input_ids of the encoded +target sequence). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_34.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef8895ffeb9d250156470a8830645d97550978ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_34.txt @@ -0,0 +1,9 @@ +thon + +from transformers import BertTokenizer, EncoderDecoderModel +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased") +model.config.decoder_start_token_id = tokenizer.cls_token_id +model.config.pad_token_id = tokenizer.pad_token_id +input_ids = tokenizer( + "The tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_35.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d28b99d39f837c348ab76d07f814319e6943db7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_35.txt @@ -0,0 +1 @@ +Its base is square, measuring 125 metres (410 ft) on each side.During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_36.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..af5a6b1350daea7df0de1723bcc0608fc3b89031 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_36.txt @@ -0,0 +1 @@ +It was the first structure to reach a height of 300 metres. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_37.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d908feca64549f713d6c19b386212c68e928b7a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_37.txt @@ -0,0 +1 @@ +Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft).Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_38.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0eaed5c8584553b66f6cd8e6411d77b8d6c99d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_38.txt @@ -0,0 +1,5 @@ +", + return_tensors="pt", + ).input_ids +labels = tokenizer( + "the eiffel tower surpassed the washington monument to become the tallest structure in the world. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_39.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..de7c75ba208ad6c2a5dd03f66c687d829db561e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_39.txt @@ -0,0 +1 @@ +it was the first structure to reach a height of 300 metres in paris in 1930. it is now taller than the chrysler building by 5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_40.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..26936433f874985e14eae986a91beea7591947d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_40.txt @@ -0,0 +1 @@ +2 metres ( 17 ft ) and is the second tallest free - standing structure in paris. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_41.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3ecc56e4919fb14bb3a9f478e702adee36c12d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_41.txt @@ -0,0 +1,7 @@ +", + return_tensors="pt", + ).input_ids +the forward function automatically creates the correct decoder_input_ids +loss = model(input_ids=input_ids, labels=labels).loss + +Detailed colab for training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_42.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_42.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_43.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cac88ea6f186c7ee6c1bb761c3a083de704bf7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_43.txt @@ -0,0 +1,2 @@ +This model's TensorFlow and Flax versions +were contributed by ydshieh. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_44.txt b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e9267c628ca3ab6d0a9fb397490cb6e2e0b2722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_encoder-decoder/chunk_44.txt @@ -0,0 +1,17 @@ +EncoderDecoderConfig +[[autodoc]] EncoderDecoderConfig + +EncoderDecoderModel +[[autodoc]] EncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +TFEncoderDecoderModel +[[autodoc]] TFEncoderDecoderModel + - call + - from_encoder_decoder_pretrained + +FlaxEncoderDecoderModel +[[autodoc]] FlaxEncoderDecoderModel + - call + - from_encoder_decoder_pretrained \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie/chunk_2.txt b/chunked/content_aware_chunking/model_doc_ernie/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..924239ae3e2b6dfca5d6e5d9d221e1d05f1b95d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie/chunk_2.txt @@ -0,0 +1,22 @@ +Usage example +Take ernie-1.0-base-zh as an example: +Python +from transformers import AutoTokenizer, AutoModel +tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh") +model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh") +Model checkpoints +| Model Name | Language | Description | +|:-------------------:|:--------:|:-------------------------------:| +| ernie-1.0-base-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-2.0-base-en | English | Layer:12, Heads:12, Hidden:768 | +| ernie-2.0-large-en | English | Layer:24, Heads:16, Hidden:1024 | +| ernie-3.0-base-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-3.0-medium-zh | Chinese | Layer:6, Heads:12, Hidden:768 | +| ernie-3.0-mini-zh | Chinese | Layer:6, Heads:12, Hidden:384 | +| ernie-3.0-micro-zh | Chinese | Layer:4, Heads:12, Hidden:384 | +| ernie-3.0-nano-zh | Chinese | Layer:4, Heads:12, Hidden:312 | +| ernie-health-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +| ernie-gram-zh | Chinese | Layer:12, Heads:12, Hidden:768 | +You can find all the supported models from huggingface's model hub: huggingface.co/nghuyong, and model details from paddle's official +repo: PaddleNLP +and ERNIE. 
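The ERNIE usage example above ends right after loading the tokenizer and model. A short continuation sketch (same nghuyong/ernie-1.0-base-zh checkpoint; the sample sentence is an arbitrary assumption) shows a plain forward pass and the shape of the returned hidden states:

```python
import torch
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0-base-zh")
model = AutoModel.from_pretrained("nghuyong/ernie-1.0-base-zh")

# Encode a Chinese sentence and run a forward pass without gradients
inputs = tokenizer("欢迎使用 ERNIE 模型!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# (batch_size, sequence_length, hidden_size); hidden_size is 768 for ernie-1.0-base-zh
print(outputs.last_hidden_state.shape)
```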
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie/chunk_3.txt b/chunked/content_aware_chunking/model_doc_ernie/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d743905441d377bcffcfcc79ebe44dd962c3892 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie/chunk_3.txt @@ -0,0 +1,41 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +ErnieConfig +[[autodoc]] ErnieConfig + - all +Ernie specific outputs +[[autodoc]] models.ernie.modeling_ernie.ErnieForPreTrainingOutput +ErnieModel +[[autodoc]] ErnieModel + - forward +ErnieForPreTraining +[[autodoc]] ErnieForPreTraining + - forward +ErnieForCausalLM +[[autodoc]] ErnieForCausalLM + - forward +ErnieForMaskedLM +[[autodoc]] ErnieForMaskedLM + - forward +ErnieForNextSentencePrediction +[[autodoc]] ErnieForNextSentencePrediction + - forward +ErnieForSequenceClassification +[[autodoc]] ErnieForSequenceClassification + - forward +ErnieForMultipleChoice +[[autodoc]] ErnieForMultipleChoice + - forward +ErnieForTokenClassification +[[autodoc]] ErnieForTokenClassification + - forward +ErnieForQuestionAnswering +[[autodoc]] ErnieForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_10.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7866b09a8b0f5719415f5164ae85115032f0f89d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +Ernie-M is a BERT-like model so it is a stacked Transformer Encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_11.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a5c37e3cb1284ce39f101caa9dfd141d558d417 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_11.txt @@ -0,0 +1 @@ +Instead of using MaskedLM for pretraining (like BERT) the authors used two novel techniques: Cross-attention Masked Language Modeling and Back-translation Masked Language Modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_12.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b15f1d3cbdbf034c0b5b8e64c1dc31984b876920 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_12.txt @@ -0,0 +1 @@ +For now these two LMHead objectives are not implemented here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_13.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..83a0f29d1dcabeddc7421b172c78fec0642fed94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_13.txt @@ -0,0 +1 @@ +It is a multilingual language model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_14.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..af48a93472435a61fc24b500f64f26add711bc9e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_14.txt @@ -0,0 +1 @@ +Next Sentence Prediction was not used in pretraining process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_15.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c56fa026e2971fe6124200ab82750c4aba7a2918 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_15.txt @@ -0,0 +1,33 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Multiple choice task guide + +ErnieMConfig +[[autodoc]] ErnieMConfig +ErnieMTokenizer +[[autodoc]] ErnieMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +ErnieMModel +[[autodoc]] ErnieMModel + - forward +ErnieMForSequenceClassification +[[autodoc]] ErnieMForSequenceClassification + - forward +ErnieMForMultipleChoice +[[autodoc]] ErnieMForMultipleChoice + - forward +ErnieMForTokenClassification +[[autodoc]] ErnieMForTokenClassification + - forward +ErnieMForQuestionAnswering +[[autodoc]] ErnieMForQuestionAnswering + - forward +ErnieMForInformationExtraction +[[autodoc]] ErnieMForInformationExtraction + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_6.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d5e8c3acf2170b9084458220b2ef69516856347 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_6.txt @@ -0,0 +1 @@ +We generate pseudo-parallel sentence pairs on a monolingual corpus to enable the learning of semantic alignments between different languages, thereby enhancing the semantic modeling of cross-lingual models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_7.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ace38508f55aec19d063af9ac58ef3ab4956a9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_7.txt @@ -0,0 +1 @@ +Experimental results show that ERNIE-M outperforms existing cross-lingual models and delivers new state-of-the-art results in various cross-lingual downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_8.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..da25758841c5a2df6576b576e5390e7ad79a3e28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by Susnato Dhar. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ernie_m/chunk_9.txt b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ernie_m/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_12.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4ab70775eb930417bb9159305c63b3b19c61592 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_12.txt @@ -0,0 +1,2 @@ +The resulting model contains information about biological +properties in its representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_13.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6aed973e88f139ff44e0df889161d1da9f789416 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_13.txt @@ -0,0 +1 @@ +The representations are learned from sequence data alone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_14.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fa46222266017bece4fe90bf623010123939dd9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_14.txt @@ -0,0 +1,3 @@ +The learned representation +space has a multiscale organization reflecting structure from the level of biochemical properties of amino acids to +remote homology of proteins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_15.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bda8fadf3f2011634cdc1c94a22c72061e97a9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_15.txt @@ -0,0 +1,2 @@ +Information about secondary and tertiary structure is encoded in the representations and +can be identified by linear projections. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_16.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e07e9b81e0e3e00cf5bbf5800144954c8f6f1d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_16.txt @@ -0,0 +1,3 @@ +Representation learning produces features that generalize across a range of +applications, enabling state-of-the-art supervised prediction of mutational effect and secondary structure and +improving state-of-the-art features for long-range contact prediction. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_17.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..504e4bee3e253d16360ccace60112ab8a92b950a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_17.txt @@ -0,0 +1,4 @@ +The abstract from +"Language models of protein sequences at the scale of evolution enable accurate structure prediction" is +Large language models have recently been shown to develop emergent capabilities with scale, going beyond +simple pattern matching to perform higher level reasoning and generate lifelike images and text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_18.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d1e12eb5769bb7f57ae0c0d1a4f2b597528b95d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_18.txt @@ -0,0 +1,3 @@ +While +language models trained on protein sequences have been studied at a smaller scale, little is known about +what they learn about biology as they are scaled up. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_19.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..53b5947d7a94cb58d02556ff1fe2f559f62f0688 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_19.txt @@ -0,0 +1,2 @@ +In this work we train models up to 15 billion parameters, +the largest language models of proteins to be evaluated to date. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_20.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e37464ffdf97c50744a0af68350879d2cdbfe65 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_20.txt @@ -0,0 +1,3 @@ +We find that as models are scaled they learn +information enabling the prediction of the three-dimensional structure of a protein at the resolution of +individual atoms. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_21.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0246f48c183a1be88b099a55bbe7d647f7983b2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_21.txt @@ -0,0 +1,2 @@ +We present ESMFold for high accuracy end-to-end atomic level structure prediction directly +from the individual sequence of a protein. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_22.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2638d9bdbc3a9e7caea131958ac9ff00dcff315f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_22.txt @@ -0,0 +1,2 @@ +ESMFold has similar accuracy to AlphaFold2 and RoseTTAFold for +sequences with low perplexity that are well understood by the language model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_23.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8301d02e31b0bb82bab0bfaff0fbf0311e027a93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_23.txt @@ -0,0 +1,3 @@ +ESMFold inference is an +order of magnitude faster than AlphaFold2, enabling exploration of the structural space of metagenomic +proteins in practical timescales. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_24.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eb45618e847a38c13f8cfd846bf2565ea0754fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_24.txt @@ -0,0 +1,2 @@ +The original code can be found here and was +developed by the Fundamental AI Research team at Meta AI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_25.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e97d12685ecc9e0a15e848d4363057f098dedd0c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_25.txt @@ -0,0 +1,2 @@ +ESM-1b, ESM-1v and ESM-2 were contributed to huggingface by jasonliu +and Matt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_26.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e5239b199fea5884d182d4c880e14a69918fc5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_26.txt @@ -0,0 +1,3 @@ +ESMFold was contributed to huggingface by Matt and +Sylvain, with a big thank you to Nikita Smetanin, Roshan Rao and Tom Sercu for their +help throughout the process! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_27.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..113ff9688e6b8961adf937fd8bc636b89aa80bbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_27.txt @@ -0,0 +1,3 @@ +Usage tips + +ESM models are trained with a masked language modeling (MLM) objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_28.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..94cb9214ee333689dfc80e04cb776162221dedd9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_28.txt @@ -0,0 +1 @@ +The HuggingFace port of ESMFold uses portions of the openfold library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_29.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4002e8e17b0d74472150b7847e4e66980fe2a49c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_29.txt @@ -0,0 +1 @@ +The openfold library is licensed under the Apache License 2.0.
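Since the usage tip above notes that ESM models are trained with a masked language modeling objective, here is a hedged sketch of filling in a masked residue. The small "facebook/esm2_t6_8M_UR50D" checkpoint and the protein sequence are illustrative assumptions, not part of the original chunks:

```python
import torch
from transformers import AutoTokenizer, EsmForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t6_8M_UR50D")
model = EsmForMaskedLM.from_pretrained("facebook/esm2_t6_8M_UR50D")

# Mask one residue in an (arbitrary) protein sequence
sequence = "MKTAYIAKQR<mask>ISFVKSHFSRQLEERLGLIEVQ"
inputs = tokenizer(sequence, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# Pick the highest-scoring token at the masked position
mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
predicted_ids = logits[0, mask_positions].argmax(dim=-1)
print(tokenizer.decode(predicted_ids))
```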
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_esm/chunk_30.txt b/chunked/content_aware_chunking/model_doc_esm/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e137c9ef70cd049ab11463dd13dc9b1f96c201c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_esm/chunk_30.txt @@ -0,0 +1,44 @@ +Resources + +Text classification task guide +Token classification task guide +Masked language modeling task guide + +EsmConfig +[[autodoc]] EsmConfig + - all +EsmTokenizer +[[autodoc]] EsmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +EsmModel +[[autodoc]] EsmModel + - forward +EsmForMaskedLM +[[autodoc]] EsmForMaskedLM + - forward +EsmForSequenceClassification +[[autodoc]] EsmForSequenceClassification + - forward +EsmForTokenClassification +[[autodoc]] EsmForTokenClassification + - forward +EsmForProteinFolding +[[autodoc]] EsmForProteinFolding + - forward + +TFEsmModel +[[autodoc]] TFEsmModel + - call +TFEsmForMaskedLM +[[autodoc]] TFEsmForMaskedLM + - call +TFEsmForSequenceClassification +[[autodoc]] TFEsmForSequenceClassification + - call +TFEsmForTokenClassification +[[autodoc]] TFEsmForTokenClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_10.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..313523a7877aa829088b614b37898d8f3868cdd1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_10.txt @@ -0,0 +1,2 @@ +To use this script, simply call it with +python convert_custom_code_checkpoint.py --checkpoint_dir my_model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_11.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf228b17ab8e5a7c3f9922fad86420ccbfa3418b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_11.txt @@ -0,0 +1,2 @@ +This will convert your checkpoint in-place, and +you can immediately load it from the directory afterwards with e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_12.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6067449e3e0c187c8708f01733ea83da03ba432 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_12.txt @@ -0,0 +1 @@ +from_pretrained(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_13.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c41dc54d05939de04e7ed8ac97d45d3f8464020 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_13.txt @@ -0,0 +1,2 @@ +If your model hasn't been +uploaded to the Hub, we recommend making a backup before attempting the conversion, just in case! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_14.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdc59cab888e82fb336bad2f822c36a3a2efedf0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_14.txt @@ -0,0 +1,18 @@ +FalconConfig +[[autodoc]] FalconConfig + - all +FalconModel +[[autodoc]] FalconModel + - forward +FalconForCausalLM +[[autodoc]] FalconForCausalLM + - forward +FalconForSequenceClassification +[[autodoc]] FalconForSequenceClassification + - forward +FalconForTokenClassification +[[autodoc]] FalconForTokenClassification + - forward +FalconForQuestionAnswering +[[autodoc]] FalconForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_5.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..41de99bad885f5dec77c0c749d04aca9e29c35be --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_5.txt @@ -0,0 +1,2 @@ +Falcon models are (as of 2023) some of the largest and most powerful open-source language models, +and consistently rank highly in the OpenLLM leaderboard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_6.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..135454cbdc2d636b0dcf952b37aa2e295de5b974 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_6.txt @@ -0,0 +1,3 @@ +Converting custom checkpoints + +Falcon models were initially added to the Hugging Face Hub as custom code checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_7.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..16fc404dd52a8d827e37ca4532b30392f771b3a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_7.txt @@ -0,0 +1,2 @@ +However, Falcon is now fully +supported in the Transformers library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_8.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02266341b91d62bbfd79f9567193488111070d7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_8.txt @@ -0,0 +1,3 @@ +If you fine-tuned a model from a custom code checkpoint, we recommend converting +your checkpoint to the new in-library format, as this should give significant improvements to stability and +performance, especially for generation, as well as removing the need to use trust_remote_code=True! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_falcon/chunk_9.txt b/chunked/content_aware_chunking/model_doc_falcon/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f831e52d43aa5fd14fc1e70764e650900919f05f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_falcon/chunk_9.txt @@ -0,0 +1,4 @@ +You can convert custom code checkpoints to full Transformers checkpoints using the convert_custom_code_checkpoint.py +script located in the +Falcon model directory +of the Transformers library. 
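To make the last step of the Falcon conversion workflow concrete, here is a minimal sketch of loading a converted checkpoint; it assumes the conversion script described above was already run against a local ./my_model directory (the same placeholder used in the chunked instructions):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# After conversion the checkpoint uses the in-library Falcon classes,
# so trust_remote_code=True is no longer needed.
tokenizer = AutoTokenizer.from_pretrained("./my_model")
model = AutoModelForCausalLM.from_pretrained("./my_model")

inputs = tokenizer("Falcon is a family of language models that", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```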
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1855fde83252dc7d270e565f0745d6fb7f5b9823 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_11.txt @@ -0,0 +1,2 @@ +🤗 Model Architecture +FastSpeech2's general structure with a Mel-spectrogram decoder was implemented, and the traditional transformer blocks were replaced with conformer blocks as done in the ESPnet library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba1826b5e3728259196ef26b85bd39d79ce627e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_12.txt @@ -0,0 +1,8 @@ +FastSpeech2 Model Architecture + +Conformer Blocks + +Convolution Module + +🤗 Transformers Usage +You can run FastSpeech2Conformer locally with the 🤗 Transformers library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..819608f81ff6c7f33cd4e0eed07d771949eb0e81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_13.txt @@ -0,0 +1,12 @@ +First install the 🤗 Transformers library, g2p-en: + +pip install --upgrade pip +pip install --upgrade transformers g2p-en + +Run inference via the Transformers modelling code with the model and hifigan separately + +thon +from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerModel, FastSpeech2ConformerHifiGan +import soundfile as sf +tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") +inputs = tokenizer("Hello, my dog is cute.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3516a3836d06030e9c5f90bcf5bed570b9ced337 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_14.txt @@ -0,0 +1,16 @@ +", return_tensors="pt") +input_ids = inputs["input_ids"] +model = FastSpeech2ConformerModel.from_pretrained("espnet/fastspeech2_conformer") +output_dict = model(input_ids, return_dict=True) +spectrogram = output_dict["spectrogram"] +hifigan = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan") +waveform = hifigan(spectrogram) +sf.write("speech.wav", waveform.squeeze().detach().numpy(), samplerate=22050) + +Run inference via the Transformers modelling code with the model and hifigan combined + +thon +from transformers import FastSpeech2ConformerTokenizer, FastSpeech2ConformerWithHifiGan +import soundfile as sf +tokenizer = FastSpeech2ConformerTokenizer.from_pretrained("espnet/fastspeech2_conformer") +inputs = tokenizer("Hello, my dog is cute. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c24cc5847bc089f903c52bc71345b375fe47ef1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_15.txt @@ -0,0 +1,15 @@ +", return_tensors="pt") +input_ids = inputs["input_ids"] +model = FastSpeech2ConformerWithHifiGan.from_pretrained("espnet/fastspeech2_conformer_with_hifigan") +output_dict = model(input_ids, return_dict=True) +waveform = output_dict["waveform"] +sf.write("speech.wav", waveform.squeeze().detach().numpy(), samplerate=22050) + +Run inference with a pipeline and specify which vocoder to use +thon +from transformers import pipeline, FastSpeech2ConformerHifiGan +import soundfile as sf + +vocoder = FastSpeech2ConformerHifiGan.from_pretrained("espnet/fastspeech2_conformer_hifigan") +synthesiser = pipeline(model="espnet/fastspeech2_conformer", vocoder=vocoder) +speech = synthesiser("Hello, my dog is cooler than you!") \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..06ea36f39852baf7c17d27919512420f3fb8154f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_16.txt @@ -0,0 +1,23 @@ +sf.write("speech.wav", speech["audio"].squeeze(), samplerate=speech["sampling_rate"]) + +FastSpeech2ConformerConfig +[[autodoc]] FastSpeech2ConformerConfig +FastSpeech2ConformerHifiGanConfig +[[autodoc]] FastSpeech2ConformerHifiGanConfig +FastSpeech2ConformerWithHifiGanConfig +[[autodoc]] FastSpeech2ConformerWithHifiGanConfig +FastSpeech2ConformerTokenizer +[[autodoc]] FastSpeech2ConformerTokenizer + - call + - save_vocabulary + - decode + - batch_decode +FastSpeech2ConformerModel +[[autodoc]] FastSpeech2ConformerModel + - forward +FastSpeech2ConformerHifiGan +[[autodoc]] FastSpeech2ConformerHifiGan + - forward +FastSpeech2ConformerWithHifiGan +[[autodoc]] FastSpeech2ConformerWithHifiGan + - forward \ No newline at end of file diff --git 
a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..46d0d44dedc47d47737919c96b6353e076235fce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_8.txt @@ -0,0 +1 @@ +Audio samples are available at https://speechresearch.github.io/fastspeech2/. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b09e478079a4d124ca77857aca7346c28c607dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fastspeech2_conformer/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by Connor Henderson. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-t5/chunk_3.txt b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..259b9a2fdf2cc00ef761f1878e58809b024e55bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_3.txt @@ -0,0 +1 @@ +The original checkpoints can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-t5/chunk_4.txt b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..718ebd12174a64002b731eb4c9f3a71a9a4948c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_4.txt @@ -0,0 +1 @@ +Refer to T5's documentation page for all API reference, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-t5/chunk_5.txt b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..e96cf27cfbe038c3874eb69cde9fc68825b24be5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-t5/chunk_5.txt @@ -0,0 +1 @@ +For more details regarding training and evaluation of the FLAN-T5, refer to the model card. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f888fc513537fbbf8381589763da721aa81c6d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_10.txt @@ -0,0 +1,2 @@ +Running on low resource devices +The model is pretty heavy (~40GB in half precision) so if you just want to run the model, make sure you load your model in 8bit, and use device_map="auto" to make sure you don't have any OOM issue! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..308c6d3846a738515a3523253d1cef61cdca5eda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_11.txt @@ -0,0 +1,9 @@ +thon + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-ul2", load_in_8bit=True, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("google/flan-ul2") +inputs = tokenizer("A step by step recipe to make bolognese pasta:", return_tensors="pt") +outputs = model.generate(**inputs) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['In a large skillet, brown the ground beef and onion over medium heat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..61f7d45518d91a939086c09950b85c904bc97e56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_12.txt @@ -0,0 +1,3 @@ +Add the garlic'] + +Refer to T5's documentation page for API reference, tips, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_5.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a78627dad388fde5a259614f3ecb0356c0b4025 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_5.txt @@ -0,0 +1 @@ +The original UL2 model also had mode switch tokens that were rather mandatory to get good performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..408a1fd899165211247bb7a0b30e39a650c7f98b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_6.txt @@ -0,0 +1 @@ +However, they were a little cumbersome as they often required some changes during inference or finetuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..cee84a122d632319b59e2c394726427c23422b2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_7.txt @@ -0,0 +1 @@ +In this update/change, we continue training UL2 20B for an additional 100k steps (with small batch) to forget “mode tokens” before applying Flan instruction tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4abdc0c04442060fee316844dac25803c48bdc36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_8.txt @@ -0,0 +1 @@ +This Flan-UL2 checkpoint does not require mode tokens anymore.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..206392fac0c4e5d2faf99f3e1c72703e5c9d8071 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flan-ul2/chunk_9.txt @@ -0,0 +1,3 @@ +Google has released the following variants: + +The original checkpoints can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fef865f5324c4fbefefbf223b5af427316f929a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_11.txt @@ -0,0 +1,2 @@ +Tips: +- Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c2b04b5d5fb9e77a1f903074783449021f36b6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_12.txt @@ -0,0 +1,53 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +FlaubertConfig +[[autodoc]] FlaubertConfig +FlaubertTokenizer +[[autodoc]] FlaubertTokenizer + +FlaubertModel +[[autodoc]] FlaubertModel + - forward +FlaubertWithLMHeadModel +[[autodoc]] FlaubertWithLMHeadModel + - forward +FlaubertForSequenceClassification +[[autodoc]] FlaubertForSequenceClassification + - forward +FlaubertForMultipleChoice +[[autodoc]] FlaubertForMultipleChoice + - forward +FlaubertForTokenClassification +[[autodoc]] FlaubertForTokenClassification + - forward +FlaubertForQuestionAnsweringSimple +[[autodoc]] FlaubertForQuestionAnsweringSimple + - forward +FlaubertForQuestionAnswering +[[autodoc]] FlaubertForQuestionAnswering + - forward + +TFFlaubertModel +[[autodoc]] TFFlaubertModel + - call +TFFlaubertWithLMHeadModel +[[autodoc]] TFFlaubertWithLMHeadModel + - call +TFFlaubertForSequenceClassification +[[autodoc]] TFFlaubertForSequenceClassification + - call +TFFlaubertForMultipleChoice +[[autodoc]] TFFlaubertForMultipleChoice + - call +TFFlaubertForTokenClassification +[[autodoc]] TFFlaubertForTokenClassification + - call +TFFlaubertForQuestionAnsweringSimple +[[autodoc]] TFFlaubertForQuestionAnsweringSimple + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6c413837b1fa4f618615ddf552745688173e8bc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_5.txt @@ -0,0 +1,2 @@ +In this paper, we introduce and share FlauBERT, a model learned on a very large and +heterogeneous 
French corpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd279b457197a8b6aab27c62758328dd4a98077f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_6.txt @@ -0,0 +1,2 @@ +Models of different sizes are trained using the new CNRS (French National Centre for +Scientific Research) Jean Zay supercomputer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a74a638ec1cf4a6ec3e6c82269246f415a1e831f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_7.txt @@ -0,0 +1,3 @@ +We apply our French language models to diverse NLP tasks (text +classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the +time they outperform other pretraining approaches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d030fb9172481933b9d5357882ce38bd30f717eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_8.txt @@ -0,0 +1,3 @@ +Different versions of FlauBERT as well as a unified evaluation +protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research +community for further reproducible experiments in French NLP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flaubert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_flaubert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..782b9cdabfe7c76c5a2bebf3d66aa5bd36d3c87c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flaubert/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by formiel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flava/chunk_5.txt b/chunked/content_aware_chunking/model_doc_flava/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c276941f7621a69732ed6e64bb8e1c2484747e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flava/chunk_5.txt @@ -0,0 +1,2 @@ +We introduce FLAVA as such a model and demonstrate +impressive performance on a wide range of 35 tasks spanning these target modalities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flava/chunk_6.txt b/chunked/content_aware_chunking/model_doc_flava/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e125f91a75f0208a7f0c65f5b564b59bdf7e8cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flava/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by aps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flava/chunk_7.txt b/chunked/content_aware_chunking/model_doc_flava/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flava/chunk_7.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_flava/chunk_8.txt b/chunked/content_aware_chunking/model_doc_flava/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..01757bd8a9c0467ba71091adb5578500e4aa0c77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_flava/chunk_8.txt @@ -0,0 +1,39 @@ +FlavaConfig +[[autodoc]] FlavaConfig +FlavaTextConfig +[[autodoc]] FlavaTextConfig +FlavaImageConfig +[[autodoc]] FlavaImageConfig +FlavaMultimodalConfig +[[autodoc]] FlavaMultimodalConfig +FlavaImageCodebookConfig +[[autodoc]] FlavaImageCodebookConfig +FlavaProcessor +[[autodoc]] FlavaProcessor +FlavaFeatureExtractor +[[autodoc]] FlavaFeatureExtractor +FlavaImageProcessor +[[autodoc]] FlavaImageProcessor + - preprocess +FlavaForPreTraining +[[autodoc]] FlavaForPreTraining + - forward +FlavaModel +[[autodoc]] FlavaModel + - forward + - get_text_features + - get_image_features +FlavaImageCodebook +[[autodoc]] FlavaImageCodebook + - forward + - get_codebook_indices + - get_codebook_probs +FlavaTextModel +[[autodoc]] FlavaTextModel + - forward +FlavaImageModel +[[autodoc]] FlavaImageModel + - forward +FlavaMultimodalModel +[[autodoc]] FlavaMultimodalModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..76e471a254aa4d98c948bebf5239e37e631baa11 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_11.txt @@ -0,0 +1,2 @@ +Usage tips +The model was trained without an attention mask as it is based on Fourier Transform. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..771c2e4609e8df3c113a04290ead3a38d039dc69 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_12.txt @@ -0,0 +1,2 @@ +The model was trained with +maximum sequence length 512 which includes pad tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e52fbe6d5d3ae0e48db58dcb89ed0d614a4e189 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_13.txt @@ -0,0 +1,2 @@ +Hence, it is highly recommended to use the same maximum +sequence length for fine-tuning and inference. 
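Because the FNet tips above recommend using the same maximum sequence length (512, pad tokens included) for fine-tuning and inference, a hedged sketch of that fixed-length setup follows; the "google/fnet-base" checkpoint and num_labels value are assumptions used for illustration:

```python
from transformers import FNetTokenizer, FNetForSequenceClassification

tokenizer = FNetTokenizer.from_pretrained("google/fnet-base")
model = FNetForSequenceClassification.from_pretrained("google/fnet-base", num_labels=2)

# FNet ignores attention masks, so consistent padding to the training length is what matters.
inputs = tokenizer(
    "FNet replaces self-attention with Fourier transforms.",
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)
outputs = model(input_ids=inputs["input_ids"], token_type_ids=inputs["token_type_ids"])
print(outputs.logits.shape)  # (1, 2)
```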
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbb09b9a80a62fe7f2870ebdab8fa8d8a40ae504 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_14.txt @@ -0,0 +1,42 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +FNetConfig +[[autodoc]] FNetConfig +FNetTokenizer +[[autodoc]] FNetTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +FNetTokenizerFast +[[autodoc]] FNetTokenizerFast +FNetModel +[[autodoc]] FNetModel + - forward +FNetForPreTraining +[[autodoc]] FNetForPreTraining + - forward +FNetForMaskedLM +[[autodoc]] FNetForMaskedLM + - forward +FNetForNextSentencePrediction +[[autodoc]] FNetForNextSentencePrediction + - forward +FNetForSequenceClassification +[[autodoc]] FNetForSequenceClassification + - forward +FNetForMultipleChoice +[[autodoc]] FNetForMultipleChoice + - forward +FNetForTokenClassification +[[autodoc]] FNetForTokenClassification + - forward +FNetForQuestionAnswering +[[autodoc]] FNetForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..215e0f900ef691675121471a74c5de22ec9f00df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_6.txt @@ -0,0 +1,3 @@ +Most surprisingly, we find that replacing the self-attention sublayer in a Transformer encoder +with a standard, unparameterized Fourier Transform achieves 92-97% of the accuracy of BERT counterparts on the GLUE +benchmark, but trains 80% faster on GPUs and 70% faster on TPUs at standard 512 input lengths. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..33a92d10c75a039fecaa70a15b7dbb1598cf876c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_7.txt @@ -0,0 +1,4 @@ +At longer input lengths, +our FNet model is significantly faster: when compared to the "efficient" Transformers on the Long Range Arena +benchmark, FNet matches the accuracy of the most accurate models, while outpacing the fastest models across all +sequence lengths on GPUs (and across relatively shorter lengths on TPUs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..378711c1cc56256b54e0ebffdfe021dc74b670e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_8.txt @@ -0,0 +1,3 @@ +Finally, FNet has a light memory footprint +and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models +outperform Transformer counterparts. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_fnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4437f404c70b3ed51882313dfac05f543c00533f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fnet/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by gchhablani. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c230b2b3927408907a73035c374772f8e648f0d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_10.txt @@ -0,0 +1 @@ +48.5). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..85a8eb48079758087e2d9fa9056ae6b83baafc84 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_11.txt @@ -0,0 +1 @@ +For semantic segmentation with UPerNet, FocalNet base at single-scale outperforms Swin by 2.4, and beats Swin at multi-scale (50.5 v.s. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..cea20a65246694922d89f7a9ad52f7b4d360c8b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_12.txt @@ -0,0 +1 @@ +49.7). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..075eb71905333330f09a86081bbad458c6a89292 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_13.txt @@ -0,0 +1 @@ +Using large FocalNet and Mask2former, we achieve 58.5 mIoU for ADE20K semantic segmentation, and 57.9 PQ for COCO Panoptic Segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc51072e3b1b92aeea78a86323437811369a31d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_14.txt @@ -0,0 +1 @@ +Using huge FocalNet and DINO, we achieved 64.3 and 64.4 mAP on COCO minival and test-dev, respectively, establishing new SoTA on top of much larger attention-based models like Swinv2-G and BEIT-3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..32d0efa7ef8897c2ccea6d9ab4149b580184997b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_17.txt @@ -0,0 +1,11 @@ +FocalNetConfig +[[autodoc]] FocalNetConfig +FocalNetModel +[[autodoc]] FocalNetModel + - forward +FocalNetForMaskedImageModeling +[[autodoc]] FocalNetForMaskedImageModeling + - forward +FocalNetForImageClassification +[[autodoc]] FocalNetForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3c81ca40e3f1e867d43c6f85473c065e0f7383d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_7.txt @@ -0,0 +1 @@ +After pretrained on ImageNet-22K in 224 resolution, it attains 86.5% and 87.3% top-1 accuracy when finetuned with resolution 224 and 384, respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b18ed44f04336ed6ba6ee49122bb96e227853dcf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_8.txt @@ -0,0 +1 @@ +When transferred to downstream tasks, FocalNets exhibit clear superiority. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_focalnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_focalnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..258b23526a22e2760fb93f49fcd411451afeeea2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_focalnet/chunk_9.txt @@ -0,0 +1 @@ +For object detection with Mask R-CNN, FocalNet base trained with 1\times outperforms the Swin counterpart by 2.1 points and already surpasses Swin trained with 3\times schedule (49.0 v.s. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_10.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..93bc2700357367092c369a2d48b2f07590a1b4a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_11.txt @@ -0,0 +1,3 @@ +Implementation Notes + +FSMT uses source and target vocabulary pairs that aren't combined into one. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7033a519d22cefb9747fdcf30fa22048ef2661a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_12.txt @@ -0,0 +1,2 @@ +It doesn't share embeddings tokens + either. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..46c786821843721b735bdfe5879b0ec858dd804d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_13.txt @@ -0,0 +1,2 @@ +Its tokenizer is very similar to [XLMTokenizer] and the main model is derived from + [BartModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e61cd2cef6285f97c0dc11c1043f3bef731c0960 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_14.txt @@ -0,0 +1,14 @@ +FSMTConfig +[[autodoc]] FSMTConfig +FSMTTokenizer +[[autodoc]] FSMTTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +FSMTModel +[[autodoc]] FSMTModel + - forward +FSMTForConditionalGeneration +[[autodoc]] FSMTForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_4.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..35fadb5fbd9498fcd914e4a23ed8afdbdc1e0dc9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_4.txt @@ -0,0 +1,2 @@ +This year we experiment with different bitext data filtering schemes, +as well as with adding filtered back-translated data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_5.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..83654c784d9f38dfcdc40f53b96497ee9c949169 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_5.txt @@ -0,0 +1,2 @@ +We also ensemble and fine-tune our models on domain-specific +data, then decode using noisy channel model reranking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3016a62b0724c6c99a9f54f109208b12abe0201 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_6.txt @@ -0,0 +1,2 @@ +Our submissions are ranked first in all four directions of the +human evaluation campaign. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..db35d4a5892fb924b3451a6866ec5364747e61fc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_7.txt @@ -0,0 +1 @@ +On En->De, our system significantly outperforms other systems as well as human translations. 
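Since FSMT keeps separate source and target vocabularies, translation runs the [FSMTTokenizer] on the source side, generates with [FSMTForConditionalGeneration], and decodes with the target vocabulary. A minimal sketch of that flow; the facebook/wmt19-en-de checkpoint name is an assumption for illustration and is not taken from this document:

```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer

# assumed checkpoint; any WMT19 FSMT checkpoint with paired src/tgt vocabularies works the same way
checkpoint = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(checkpoint)
model = FSMTForConditionalGeneration.from_pretrained(checkpoint)

# encode with the source vocabulary, decode with the target vocabulary
inputs = tokenizer("Machine learning is great!", return_tensors="pt")
outputs = model.generate(**inputs, num_beams=5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```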
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..219986ae8ae6d720700e6ba25e1911d916fd40c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_8.txt @@ -0,0 +1 @@ +This system improves upon our WMT'18 submission by 4.5 BLEU points. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fsmt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_fsmt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b646f6106e482b1a5fe9874b29783705d3a2d8a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fsmt/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by stas. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_10.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b874ab0621a9357b201f479e6d9cf5075a428e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +Since Funnel Transformer uses pooling, the sequence length of the hidden states changes after each block of layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_11.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f26f203f0beb6be95d5cb8ae186ab895ca14447 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_11.txt @@ -0,0 +1 @@ +This way, their length is divided by 2, which speeds up the computation of the next hidden states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_12.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f9a531d696eeb0e04fb62c2b4dd540bc8f1991a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_12.txt @@ -0,0 +1 @@ +The base model therefore has a final sequence length that is a quarter of the original one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_13.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1450e65c4d5c8bfc460d595886ed39b134295a49 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_13.txt @@ -0,0 +1,2 @@ +This model can be used + directly for tasks that just require a sentence summary (like sequence classification or multiple choice). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_14.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9653fa3dafb8bd9846d1496c16662ac27f5a4665 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_14.txt @@ -0,0 +1,3 @@ +For other + tasks, the full model is used; this full model has a decoder that upsamples the final hidden states to the same + sequence length as the input. 
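To make the pooling behaviour concrete, the sketch below compares the hidden-state lengths of the base (encoder-only, pooled) and full (decoder-upsampled) variants. The funnel-transformer/small and funnel-transformer/small-base checkpoint names are assumptions used only for illustration:

```python
import torch
from transformers import FunnelBaseModel, FunnelModel, FunnelTokenizerFast

tokenizer = FunnelTokenizerFast.from_pretrained("funnel-transformer/small")
inputs = tokenizer("Funnel pools the hidden states between blocks.", return_tensors="pt")

# "-base" variant: encoder blocks only, so the output keeps the pooled (shorter) length
base = FunnelBaseModel.from_pretrained("funnel-transformer/small-base")
# full variant: adds the upsampling decoder, so the output length matches the input
full = FunnelModel.from_pretrained("funnel-transformer/small")

with torch.no_grad():
    print(inputs["input_ids"].shape[1])               # original sequence length
    print(base(**inputs).last_hidden_state.shape[1])  # roughly a quarter of it
    print(full(**inputs).last_hidden_state.shape[1])  # upsampled back to the original length
```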
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_15.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..1547d19f93fe4ae8efd8b1696f907fca55c5d0c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_15.txt @@ -0,0 +1 @@ +For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token classification, we need a hidden state with the same sequence length as the original input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_16.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e163f8ac847252cda390a0cc6cdc00038fb82511 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_16.txt @@ -0,0 +1 @@ +In those cases, the final hidden states are upsampled to the input sequence length and go through two additional layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_17.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..27972aaf511b0a990257b793ed00be366d1e4c63 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_17.txt @@ -0,0 +1 @@ +That's why there are two versions of each checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_18.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..983355356cabc3268e6a20d5ad000b8c2b05bba6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_18.txt @@ -0,0 +1 @@ +The version suffixed with "-base" contains only the three blocks, while the version without that suffix contains the three blocks and the upsampling head with its additional layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_19.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d80bc5a905d1f2797d7a7ad7fcc99ca7febaa7e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_19.txt @@ -0,0 +1 @@ +The Funnel Transformer checkpoints are all available with a full version and a base version. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_20.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb30187d35cb815bca595b9572fd88eae40241a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_20.txt @@ -0,0 +1,4 @@ +The first ones should be + used for [FunnelModel], [FunnelForPreTraining], + [FunnelForMaskedLM], [FunnelForTokenClassification] and + [FunnelForQuestionAnswering]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_21.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..44612f0c8d56805c6fef28db60be4effd69b86ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_21.txt @@ -0,0 +1,3 @@ +The second ones should be used for + [FunnelBaseModel], [FunnelForSequenceClassification] and + [FunnelForMultipleChoice].
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_22.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2148621bcda6902613a276c650aaf4844dc46f52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_22.txt @@ -0,0 +1,71 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +FunnelConfig +[[autodoc]] FunnelConfig +FunnelTokenizer +[[autodoc]] FunnelTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +FunnelTokenizerFast +[[autodoc]] FunnelTokenizerFast +Funnel specific outputs +[[autodoc]] models.funnel.modeling_funnel.FunnelForPreTrainingOutput +[[autodoc]] models.funnel.modeling_tf_funnel.TFFunnelForPreTrainingOutput + +FunnelBaseModel +[[autodoc]] FunnelBaseModel + - forward +FunnelModel +[[autodoc]] FunnelModel + - forward +FunnelModelForPreTraining +[[autodoc]] FunnelForPreTraining + - forward +FunnelForMaskedLM +[[autodoc]] FunnelForMaskedLM + - forward +FunnelForSequenceClassification +[[autodoc]] FunnelForSequenceClassification + - forward +FunnelForMultipleChoice +[[autodoc]] FunnelForMultipleChoice + - forward +FunnelForTokenClassification +[[autodoc]] FunnelForTokenClassification + - forward +FunnelForQuestionAnswering +[[autodoc]] FunnelForQuestionAnswering + - forward + +TFFunnelBaseModel +[[autodoc]] TFFunnelBaseModel + - call +TFFunnelModel +[[autodoc]] TFFunnelModel + - call +TFFunnelModelForPreTraining +[[autodoc]] TFFunnelForPreTraining + - call +TFFunnelForMaskedLM +[[autodoc]] TFFunnelForMaskedLM + - call +TFFunnelForSequenceClassification +[[autodoc]] TFFunnelForSequenceClassification + - call +TFFunnelForMultipleChoice +[[autodoc]] TFFunnelForMultipleChoice + - call +TFFunnelForTokenClassification +[[autodoc]] TFFunnelForTokenClassification + - call +TFFunnelForQuestionAnswering +[[autodoc]] TFFunnelForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_funnel/chunk_9.txt b/chunked/content_aware_chunking/model_doc_funnel/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_funnel/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_10.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2eeef4585cfffde0c5ccc67ebd2e03e9671e723f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_10.txt @@ -0,0 +1 @@ +Users should specify the torch_dtype they want, and if they don't it will be torch.float32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_11.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..28665b877cc7abd6c33c04bed114fb2261876ed9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_11.txt @@ -0,0 +1 @@ +Finetuning the model in float16 is not recommended and known to produce nan, as such the model should be fine-tuned in bfloat16. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_12.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..33f01b41c3192d5ef63a04ff8e470602e986f9dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_12.txt @@ -0,0 +1,20 @@ +Tips: + +To convert the model, you need to clone the original repository using git clone https://github.com/persimmon-ai-labs/adept-inference, then get the checkpoints: + +git clone https://github.com/persimmon-ai-labs/adept-inference +wget path/to/fuyu-8b-model-weights.tar +tar -xvf fuyu-8b-model-weights.tar +python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path \ + --pt_model_path /path/to/fuyu_8b_release/iter_0001251/mp_rank_00/model_optim_rng.pt + --ada_lib_path /path/to/adept-inference +For the chat model: + +wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar +tar -xvf 8b_base_model_release.tar +Then, model can be loaded via: +py +from transformers import FuyuConfig, FuyuForCausalLM +model_config = FuyuConfig() +model = FuyuForCausalLM(model_config).from_pretrained('/output/path') +Inputs need to be passed through a specific Processor to have the correct formats. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_13.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..23d0ef35b4c4ded6b4636aea6477edbe6e007f39 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_13.txt @@ -0,0 +1 @@ +A processor requires an image_processor and a tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_14.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0d3b0263c35d5fc1b18d810801422ffd3be8c4a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_14.txt @@ -0,0 +1,15 @@ +Hence, inputs can be loaded via: + +from PIL import Image +from transformers import AutoTokenizer +from transformers.models.fuyu.processing_fuyu import FuyuProcessor +from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor +tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b') +image_processor = FuyuImageProcessor() +processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer) +text_prompt = "Generate a coco-style caption.\n" +bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png" +bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content)) +inputs_to_model = processor(text=text_prompt, images=image_pil) + +This model was contributed by Molbap. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_15.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. 
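A self-contained version of the Fuyu preprocessing example above, with the io and requests imports it relies on spelled out and the decoded bus image passed to the processor; everything else follows the snippet as given:

```python
import io

import requests
from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
from transformers.models.fuyu.processing_fuyu import FuyuProcessor

tokenizer = AutoTokenizer.from_pretrained("adept-hf-collab/fuyu-8b")
image_processor = FuyuImageProcessor()
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)

text_prompt = "Generate a coco-style caption.\n"
bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))

# the decoded PIL image is what the processor expects under `images`
inputs_to_model = processor(text=text_prompt, images=bus_image_pil)
```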
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_16.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0522f4b9b0df38fb48e14013f4bdd91503100a50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_16.txt @@ -0,0 +1 @@ +Fuyu uses a sentencepiece based tokenizer, with a Unigram model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_17.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c343f5896df65be06c7f1380084581ddfa9244c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_17.txt @@ -0,0 +1 @@ +It supports bytefallback, which is only available in tokenizers==0.14.0 for the fast tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_18.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f136186a061869af864a7682b8586eb013b1455 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_18.txt @@ -0,0 +1 @@ +The LlamaTokenizer is used as it is a standard wrapper around sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_19.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..faa994b6a3eac4e010b8534db5f36eb3e9441523 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_19.txt @@ -0,0 +1,13 @@ +The authors suggest to use the following prompt for image captioning: f"Generate a coco-style caption.\\n" + +FuyuConfig +[[autodoc]] FuyuConfig +FuyuForCausalLM +[[autodoc]] FuyuForCausalLM + - forward +FuyuImageProcessor +[[autodoc]] FuyuImageProcessor + - call +FuyuProcessor +[[autodoc]] FuyuProcessor + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_7.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..db818dde1423d315d6094190b8b6a044d02bd9a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_7.txt @@ -0,0 +1,2 @@ +The Fuyu models were trained using bfloat16, but the original inference uses float16 The checkpoints uploaded on the hub use torch_dtype = 'float16' which will be +used by the AutoModel API to cast the checkpoints from torch.float32 to torch.float16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_8.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e447e491e18ef028d5d033e89bb3a101c13846ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_8.txt @@ -0,0 +1 @@ +The dtype of the online weights is mostly irrelevant, unless you are using torch_dtype="auto" when initializing a model using model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto"). 
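A hedged sketch of the dtype behaviour described in the surrounding notes, reusing the adept-hf-collab/fuyu-8b identifier from the earlier snippet; the three calls are alternatives, not steps to run together:

```python
import torch
from transformers import FuyuForCausalLM

ckpt = "adept-hf-collab/fuyu-8b"

# default: weights are downloaded in the checkpoint dtype, then cast to torch.float32
model_fp32 = FuyuForCausalLM.from_pretrained(ckpt)

# torch_dtype="auto": keep the float16 dtype stored in the hub checkpoint
model_fp16 = FuyuForCausalLM.from_pretrained(ckpt, torch_dtype="auto")

# explicit bfloat16, in line with the fine-tuning recommendation above
model_bf16 = FuyuForCausalLM.from_pretrained(ckpt, torch_dtype=torch.bfloat16)
```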
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_fuyu/chunk_9.txt b/chunked/content_aware_chunking/model_doc_fuyu/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..006237698db1f67b611969f034511e46323e11c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_fuyu/chunk_9.txt @@ -0,0 +1 @@ +The reason is that the model will first be downloaded ( using the dtype of the checkpoints online) then it will be cast to the default dtype of torch (becomes torch.float32). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_10.txt b/chunked/content_aware_chunking/model_doc_git/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..71c787f08eca00dd632f39a986be0bebe60668da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_10.txt @@ -0,0 +1 @@ +GIT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_11.txt b/chunked/content_aware_chunking/model_doc_git/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_12.txt b/chunked/content_aware_chunking/model_doc_git/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_13.txt b/chunked/content_aware_chunking/model_doc_git/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_14.txt b/chunked/content_aware_chunking/model_doc_git/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd99f2d13ac9c6818dd9131ea371f40e1ef0dc75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_14.txt @@ -0,0 +1,3 @@ +Usage tips + +GIT is implemented in a very similar way to GPT-2, the only difference being that the model is also conditioned on pixel_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_15.txt b/chunked/content_aware_chunking/model_doc_git/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aee7ce3855e9babe448a44f1265bb866429e045 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_15.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GIT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_16.txt b/chunked/content_aware_chunking/model_doc_git/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..011b6a0bbd95902933babb4e4226ff9e1878a95f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_16.txt @@ -0,0 +1 @@ +Demo notebooks regarding inference + fine-tuning GIT on custom data can be found here. 
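Because GIT is GPT-2-like but additionally conditioned on pixel_values, image captioning looks like ordinary causal generation with one extra input. A hedged sketch; the microsoft/git-base checkpoint and the test image URL are assumptions not named in this document:

```python
import requests
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

processor = AutoProcessor.from_pretrained("microsoft/git-base")      # assumed checkpoint
model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"       # any test image works
image = Image.open(requests.get(url, stream=True).raw)

# the image conditioning enters through pixel_values; the decoding itself is GPT-2 style
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values=pixel_values, max_length=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```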
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_17.txt b/chunked/content_aware_chunking/model_doc_git/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..a89ca786939c1b1d82990f0945c438f40adf05a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Causal language modeling task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_18.txt b/chunked/content_aware_chunking/model_doc_git/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_19.txt b/chunked/content_aware_chunking/model_doc_git/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f3db6f024bdb205ea35c94fb7010f6e1c1ed987 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_19.txt @@ -0,0 +1,17 @@ +GitVisionConfig +[[autodoc]] GitVisionConfig +GitVisionModel +[[autodoc]] GitVisionModel + - forward +GitConfig +[[autodoc]] GitConfig + - all +GitProcessor +[[autodoc]] GitProcessor + - call +GitModel +[[autodoc]] GitModel + - forward +GitForCausalLM +[[autodoc]] GitForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_5.txt b/chunked/content_aware_chunking/model_doc_git/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..4316a30dba618c7e7fd4d731770786534c250df0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_5.txt @@ -0,0 +1 @@ +In GIT, we simplify the architecture as one image encoder and one text decoder under a single language modeling task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_6.txt b/chunked/content_aware_chunking/model_doc_git/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cac1b46dd831d5d83c10d668106ca6ece91a802 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_6.txt @@ -0,0 +1 @@ +We also scale up the pre-training data and the model size to boost the model performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_7.txt b/chunked/content_aware_chunking/model_doc_git/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..5574376fc58527cd5c3aa877cebe1a7084658273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_7.txt @@ -0,0 +1 @@ +Without bells and whistles, our GIT establishes new state of the arts on 12 challenging benchmarks with a large margin. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_8.txt b/chunked/content_aware_chunking/model_doc_git/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..74c51c2b44628e11e8326753fcc232f2cf561f90 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_8.txt @@ -0,0 +1 @@ +For instance, our model surpasses the human performance for the first time on TextCaps (138.2 vs. 125.5 in CIDEr). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_git/chunk_9.txt b/chunked/content_aware_chunking/model_doc_git/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b265626296aa253b84ec998b4e4089963680fba9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_git/chunk_9.txt @@ -0,0 +1 @@ +Furthermore, we present a new scheme of generation-based image classification and scene text recognition, achieving decent performance on standard benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_10.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac84dc6d23687953420ee0d185669fc51836add8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_10.txt @@ -0,0 +1 @@ +In addition, the proposed decoder shows better performance than the previously proposed decoders, with considerably less computational complexity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_11.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac3a86c3ee828fc3a4951669eeef2e595060cea7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_11.txt @@ -0,0 +1 @@ +Furthermore, we improve the depth-specific augmentation method by utilizing an important observation in depth estimation to enhance the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_12.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcf7cd59d52c217995f22a1e831971e666b1f9a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_12.txt @@ -0,0 +1 @@ +Our network achieves state-of-the-art performance over the challenging depth dataset NYU Depth V2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_13.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2f8095c52cf35a51e4373fc9917ce2d92f5d7b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_13.txt @@ -0,0 +1 @@ +Extensive experiments have been conducted to validate and show the effectiveness of the proposed approach. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_14.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f703bd225652458c759788fe08efc025d1da4299 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_14.txt @@ -0,0 +1 @@ +Finally, our model shows better generalisation ability and robustness than other comparative models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_15.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6dc1f570105d2eb7f9c2446bf7e5f701054cf176 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_15.txt @@ -0,0 +1 @@ +Summary of the approach. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_16.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_16.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_17.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_17.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_18.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_18.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_19.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d966cfed074426572ff04c4d2c26eeeff2a709e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_19.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GLPN. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_20.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..62afb36f0bb9bb62cec2e6914da097f81b87d229 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_20.txt @@ -0,0 +1 @@ +Demo notebooks for [GLPNForDepthEstimation] can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_21.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c6e40e0a94d32dc2abf50b17d2bb6f1b44f6d88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_21.txt @@ -0,0 +1,16 @@ +Monocular depth estimation task guide + +GLPNConfig +[[autodoc]] GLPNConfig +GLPNFeatureExtractor +[[autodoc]] GLPNFeatureExtractor + - call +GLPNImageProcessor +[[autodoc]] GLPNImageProcessor + - preprocess +GLPNModel +[[autodoc]] GLPNModel + - forward +GLPNForDepthEstimation +[[autodoc]] GLPNForDepthEstimation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_7.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..96eff428d070391250bafa18233f3f566e337596 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_7.txt @@ -0,0 +1 @@ +In this paper, we propose a novel structure and training strategy for monocular depth estimation to further improve the prediction accuracy of the network. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_8.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f91e92b81f02237b8adc417690b60e9b0a768bc8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_8.txt @@ -0,0 +1 @@ +We deploy a hierarchical transformer encoder to capture and convey the global context, and design a lightweight yet powerful decoder to generate an estimated depth map while considering local connectivity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_glpn/chunk_9.txt b/chunked/content_aware_chunking/model_doc_glpn/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..268bc17134c662a92a1f971d9bef1a89eae91a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_glpn/chunk_9.txt @@ -0,0 +1 @@ +By constructing connected paths between multi-scale local features and the global decoding stream with our proposed selective feature fusion module, the network can integrate both representations and recover fine details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_5.txt b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..83fa2ca92d4fbae28256ce77768e7c4586105da4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by AI Sweden Models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_6.txt b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf598863cfa2a2ba042f5c852ea3d2ba96159372 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_6.txt @@ -0,0 +1,10 @@ +Usage example +thon + +from transformers import AutoTokenizer, AutoModelForCausalLM +tokenizer = AutoTokenizer.from_pretrained("AI-Sweden-Models/gpt-sw3-356m") +model = AutoModelForCausalLM.from_pretrained("AI-Sweden-Models/gpt-sw3-356m") +input_ids = tokenizer("Träd är fina för att", return_tensors="pt")["input_ids"] +generated_token_ids = model.generate(inputs=input_ids, max_new_tokens=10, do_sample=True)[0] +print(tokenizer.decode(generated_token_ids)) +Träd är fina för att de är färgstarka. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_7.txt b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4361cd48549d54bde9c456d2444a40ba190fd972 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_7.txt @@ -0,0 +1,9 @@ +Men ibland är det fint + +Resources + +Text classification task guide +Token classification task guide +Causal language modeling task guide + +The implementation uses the GPT2Model coupled with our GPTSw3Tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa914e345b8bcbbeba5cea0ad9cc095856894659 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_8.txt @@ -0,0 +1,2 @@ +Refer to GPT2Model documentation +for API reference and examples. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_9.txt b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c52d297bad59b25e7b8fb194aaf1f299a93d0667 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt-sw3/chunk_9.txt @@ -0,0 +1,5 @@ +Note that sentencepiece is required to use our tokenizer and can be installed with pip install transformers[sentencepiece] or pip install sentencepiece + +GPTSw3Tokenizer +[[autodoc]] GPTSw3Tokenizer + - save_vocabulary \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe2e110662f7703596586b15e7b7401fab8e9f18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_10.txt @@ -0,0 +1,4 @@ +Usage tips + +GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..53ffb08f8823aeb2be1e18462430f7ffcbfd17b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_11.txt @@ -0,0 +1,2 @@ +GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next + token in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..387e3dd4d3e56b9c870479538d486179e85c9d33 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_12.txt @@ -0,0 +1,2 @@ +Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be + observed in the run_generation.py example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..11fab4e1f1ed244a078140254ccb43ae664d8d29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_13.txt @@ -0,0 +1,2 @@ +The model can take the past_key_values (for PyTorch) or past (for TF) as input, which is the previously computed + key/value attention pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b844b1ac286bd162b4e35fdfbd95bc083e40a594 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_14.txt @@ -0,0 +1,2 @@ +Using this (past_key_values or past) value prevents the model from re-computing + pre-computed values in the context of text generation. 
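A minimal PyTorch sketch of the caching described above, using the openai-community/gpt2 checkpoint referenced elsewhere in this document: the prompt is encoded once, and each subsequent step feeds only the new token together with the cached key/value pairs.

```python
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tokenizer = GPT2TokenizerFast.from_pretrained("openai-community/gpt2")
model = GPT2LMHeadModel.from_pretrained("openai-community/gpt2").eval()

inputs = tokenizer("Hello, my dog is", return_tensors="pt")
with torch.no_grad():
    out = model(**inputs, use_cache=True)
past = out.past_key_values            # key/value pairs computed for the prompt

# next step: pass only the newly chosen token plus the cache instead of the full sequence
next_token = out.logits[:, -1:].argmax(dim=-1)
with torch.no_grad():
    out = model(input_ids=next_token, past_key_values=past, use_cache=True)
print(out.logits.shape)               # (batch, 1, vocab): only the new position was computed
```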
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b89d12a0cce9a68413bd59579e00ce4efba4825 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_15.txt @@ -0,0 +1,3 @@ +For PyTorch, see past_key_values argument of the + [GPT2Model.forward] method, or for TF the past argument of the + [TFGPT2Model.call] method for more information on its usage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dd739e9673d3d76e6865c2dddaf46cd97cbdd18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_16.txt @@ -0,0 +1,2 @@ +Enabling the scale_attn_by_inverse_layer_idx and reorder_and_upcast_attn flags will apply the training stability + improvements from Mistral (for PyTorch only). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..52d40421fb93f345b439d74c9a157f072eb79aa2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_17.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GPT2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_18.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..47ee19f46da72bd8d4698d95e95bb0368e2bbc6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_20.txt @@ -0,0 +1 @@ +A blog on how to Finetune a non-English GPT-2 Model with Hugging Face. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9847b4c44426f124c054595b507104228d9c8427 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_21.txt @@ -0,0 +1 @@ +A blog on How to generate text: using different decoding methods for language generation with Transformers with GPT-2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..142d83994faff59f097875dd54dddf343c79d621 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_22.txt @@ -0,0 +1 @@ +A blog on Training CodeParrot 🦜 from Scratch, a large GPT-2 model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee895a91f3b0799b622284e171d8a04b6b52d537 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_23.txt @@ -0,0 +1 @@ +A blog on Faster Text Generation with TensorFlow and XLA with GPT-2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..dca3a023484a29bd8a4a4b021eebd228de4fca99 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_24.txt @@ -0,0 +1 @@ +A blog on How to train a Language Model with Megatron-LM with a GPT-2 model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_25.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6c6d0abc64d9e3489dd231d247aef25d113109d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_25.txt @@ -0,0 +1 @@ +A notebook on how to finetune GPT2 to generate lyrics in the style of your favorite artist. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_26.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d06f544bfcadd33caa11b9818ffcb202eb968689 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_26.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune GPT2 to generate tweets in the style of your favorite Twitter user. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_27.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc36d8d0346391c9d13b6f2d3660fa0d712d712c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_27.txt @@ -0,0 +1,2 @@ +🌎 +Causal language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_28.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..083d32af08ac12201df76ec77b51186c99a92c0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_28.txt @@ -0,0 +1 @@ +[GPT2LMHeadModel] is supported by this causal language modeling example script, text generation example script, and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_29.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..7daa91d8bb36854774381416013f454296754722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_29.txt @@ -0,0 +1 @@ +[TFGPT2LMHeadModel] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_30.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..33fc357d598d12843510c09daf484c9e172659e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_30.txt @@ -0,0 +1 @@ +[FlaxGPT2LMHeadModel] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_31.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5f7562e51fd3356f93f0ea5d98a7b48baa9044 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_31.txt @@ -0,0 +1,57 @@ +Text classification task guide +Token classification task guide +Causal language modeling task guide + +GPT2Config +[[autodoc]] GPT2Config +GPT2Tokenizer +[[autodoc]] GPT2Tokenizer + - save_vocabulary +GPT2TokenizerFast +[[autodoc]] GPT2TokenizerFast +GPT2 specific outputs +[[autodoc]] models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput +[[autodoc]] models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput + +GPT2Model +[[autodoc]] GPT2Model + - forward +GPT2LMHeadModel +[[autodoc]] GPT2LMHeadModel + - forward +GPT2DoubleHeadsModel +[[autodoc]] GPT2DoubleHeadsModel + - forward +GPT2ForQuestionAnswering +[[autodoc]] GPT2ForQuestionAnswering + - forward +GPT2ForSequenceClassification +[[autodoc]] GPT2ForSequenceClassification + - forward +GPT2ForTokenClassification +[[autodoc]] GPT2ForTokenClassification + - forward + +TFGPT2Model +[[autodoc]] TFGPT2Model + - call +TFGPT2LMHeadModel +[[autodoc]] TFGPT2LMHeadModel + - call +TFGPT2DoubleHeadsModel +[[autodoc]] TFGPT2DoubleHeadsModel + - call +TFGPT2ForSequenceClassification +[[autodoc]] TFGPT2ForSequenceClassification + - call +TFSequenceClassifierOutputWithPast +[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutputWithPast +TFGPT2Tokenizer +[[autodoc]] TFGPT2Tokenizer + +FlaxGPT2Model +[[autodoc]] FlaxGPT2Model + - call +FlaxGPT2LMHeadModel +[[autodoc]] FlaxGPT2LMHeadModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_gpt2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt2/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..3034a5bc1b1ab860f02ff6f3d827eda48e050ac8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_10.txt @@ -0,0 +1,2 @@ +Implementation details +The main differences compared to GPT2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..869bb7486cda022541c479874facaaaadb971f48 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_11.txt @@ -0,0 +1 @@ +- Added support for Multi-Query Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1a59c4940decac6fb7f248468fce0872ad1e527 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_12.txt @@ -0,0 +1 @@ +- Use gelu_pytorch_tanh instead of classic gelu. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_13.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd3c0ffcdf5950e16863535fc980ad307f09d044 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_13.txt @@ -0,0 +1 @@ +- Avoid unnecessary synchronizations (this has since been added to GPT2 in #20061, but wasn't in the reference codebase). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_14.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3632f791b2ae0ca217b9687f5d509ae02986bba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_14.txt @@ -0,0 +1 @@ +- Use Linear layers instead of Conv1D (good speedup but makes the checkpoints incompatible). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_15.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4f19c40c4328041933a9f18ad7ed99b923eed43 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_15.txt @@ -0,0 +1 @@ +- Merge _attn and _upcast_and_reordered_attn. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_16.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf421d9a91a7624998b3ce625c3f56826ca7b019 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_16.txt @@ -0,0 +1 @@ +Always merge the matmul with scaling. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_17.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..75a7edfb1216e8076a9fc6043a28b5ea7ad176d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_17.txt @@ -0,0 +1,2 @@ +Rename reorder_and_upcast_attn->attention_softmax_in_fp32 +- Cache the attention mask value to avoid recreating it every time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_18.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7a68e390f1c81e853594195aa6f4bb6d38281f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_18.txt @@ -0,0 +1 @@ +- Use jit to fuse the attention fp32 casting, masking, softmax, and scaling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_19.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fd88bdc7c32a5a4c9e1fa1da1ac09f09c27f34f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_19.txt @@ -0,0 +1 @@ +- Combine the attention and causal masks into a single one, pre-computed for the whole model instead of every layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_20.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d2a2f3f44a542d44402d1a8cc1b5063f619306 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_20.txt @@ -0,0 +1 @@ +- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?) \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_21.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ecb23c9ebb97e0e109ca69f7b09e2bafa78c7cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_21.txt @@ -0,0 +1 @@ +- Use the memory layout (self.num_heads, 3, self.head_dim) instead of (3, self.num_heads, self.head_dim) for the QKV tensor with MHA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_22.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8e4fc3be074de6dd6cb52ed83581b8d29b9693f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_22.txt @@ -0,0 +1 @@ +(prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model). 
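To illustrate the multi-query attention mentioned in the implementation notes above, here is a shape-level sketch in plain PyTorch (not the GPTBigCode implementation itself): queries keep one set of heads, while a single key/value head is broadcast across all of them, which is what shrinks the cached key/value tensors.

```python
import torch

batch, seq, num_heads, head_dim = 2, 16, 12, 64   # toy sizes for illustration

q = torch.randn(batch, num_heads, seq, head_dim)  # per-head queries
k = torch.randn(batch, 1, seq, head_dim)          # single shared key head
v = torch.randn(batch, 1, seq, head_dim)          # single shared value head

# the size-1 head dimension of k/v broadcasts against the num_heads query heads
scores = (q @ k.transpose(-1, -2)) / head_dim**0.5
out = scores.softmax(dim=-1) @ v                  # (batch, num_heads, seq, head_dim)
print(out.shape)
```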
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_23.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..332054f91c358cf409d7ff56fe40c220eab6c728 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_23.txt @@ -0,0 +1,3 @@ +You can read more about the optimizations in the original pull request +Combining Starcoder and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_24.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_24.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Make also sure that you have a hardware that is compatible with Flash-Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_25.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_25.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of flash-attn repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_26.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_26.txt @@ -0,0 +1 @@ +Make also sure to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_27.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b85589f15fe5ced951abf566bc87a151696bb5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_27.txt @@ -0,0 +1,18 @@ +`torch.float16``) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder") +prompt = "def hello_world():" +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False) +tokenizer.batch_decode(generated_ids)[0] +'def hello_world():\n print("hello world")\n\nif name == "main":\n print("hello world")\n<|endoftext|>' + +Expected speedups +Below is a expected speedup diagram that compares pure inference time between the native implementation in transformers using bigcode/starcoder checkpoint and the Flash Attention 2 version of the model using two different sequence lengths. 
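The snippet above loads the checkpoint with attn_implementation="flash_attention_2"; the speedup diagram it refers to comes from the maintainers' own benchmark. A rough way to sanity-check the gain on your own hardware is sketched below; it assumes a CUDA GPU with flash-attn installed, and the timing loop is illustrative rather than the official benchmark:

```python
# Hedged timing sketch comparing eager attention with Flash Attention 2.
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/gpt_bigcode-santacoder"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
inputs = tokenizer(["def hello_world():"] * 8, return_tensors="pt").to("cuda")

def time_generate(attn_implementation):
    model = AutoModelForCausalLM.from_pretrained(
        checkpoint, torch_dtype=torch.float16, attn_implementation=attn_implementation
    ).to("cuda")
    model.generate(**inputs, max_new_tokens=16, do_sample=False)  # warm-up
    torch.cuda.synchronize()
    start = time.perf_counter()
    model.generate(**inputs, max_new_tokens=128, do_sample=False)
    torch.cuda.synchronize()
    return time.perf_counter() - start

print("eager:            ", time_generate("eager"))
print("flash_attention_2:", time_generate("flash_attention_2"))
```

Longer prompts and larger batches tend to show a bigger relative gap, consistent with the sequence-length comparison mentioned above.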
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_28.txt b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d9284d05e0adf21d30b1d1c2cfce7f943665da0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_bigcode/chunk_28.txt @@ -0,0 +1,14 @@ +GPTBigCodeConfig +[[autodoc]] GPTBigCodeConfig +GPTBigCodeModel +[[autodoc]] GPTBigCodeModel + - forward +GPTBigCodeForCausalLM +[[autodoc]] GPTBigCodeForCausalLM + - forward +GPTBigCodeForSequenceClassification +[[autodoc]] GPTBigCodeForSequenceClassification + - forward +GPTBigCodeForTokenClassification +[[autodoc]] GPTBigCodeForTokenClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd63de24f73229456a6737f75a76f0aa22623818 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_10.txt @@ -0,0 +1 @@ +torch.float16). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2ad2731e7da01d912862b73b0f17cb2cbda03d7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_11.txt @@ -0,0 +1,17 @@ +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-2.7B", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-2.7B") +prompt = "def hello_world():" +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +tokenizer.batch_decode(generated_ids)[0] +"def hello_world():\n >>> run_script("hello.py")\n >>> exit(0)\n<|endoftext|>" + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using EleutherAI/gpt-neo-2.7B checkpoint and the Flash Attention 2 version of the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a08d68600752ffb7f2b34d767d694c77523dc413 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_12.txt @@ -0,0 +1,32 @@ +Note that for GPT-Neo it is not possible to train / run on very long context as the max position embeddings is limited to 2048 - but this is applicable to all gpt-neo models and not specific to FA-2 + +Resources + +Text classification task guide +Causal language modeling task guide + +GPTNeoConfig +[[autodoc]] GPTNeoConfig + +GPTNeoModel +[[autodoc]] GPTNeoModel + - forward +GPTNeoForCausalLM +[[autodoc]] GPTNeoForCausalLM + - forward +GPTNeoForQuestionAnswering +[[autodoc]] GPTNeoForQuestionAnswering + - forward +GPTNeoForSequenceClassification +[[autodoc]] GPTNeoForSequenceClassification + - forward +GPTNeoForTokenClassification +[[autodoc]] GPTNeoForTokenClassification + - forward + +FlaxGPTNeoModel +[[autodoc]] FlaxGPTNeoModel + - call +FlaxGPTNeoForCausalLM +[[autodoc]] FlaxGPTNeoForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_5.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dbd220b9ea451a58ba3b29c76dbc72fbf91344b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_5.txt @@ -0,0 +1,8 @@ +thon + +from transformers import GPTNeoForCausalLM, GPT2Tokenizer +model = GPTNeoForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B") +tokenizer = GPT2Tokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B") +prompt = ( + "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " + "previously unexplored valley, in the Andes Mountains. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_6.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..dde93ae25f9bd4bbc964047f9c1ddf620316858b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_6.txt @@ -0,0 +1,2 @@ +Even more surprising to the " + "researchers was the fact that the unicorns spoke perfect English." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_7.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8a41658e84a2d0552ae4c4834596bc21438c201 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_7.txt @@ -0,0 +1,12 @@ +) +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +gen_tokens = model.generate( + input_ids, + do_sample=True, + temperature=0.9, + max_length=100, + ) +gen_text = tokenizer.batch_decode(gen_tokens)[0] + +Combining GPT-Neo and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature, and make sure your hardware is compatible with Flash-Attention 2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8449ee3e4ac9c50346af057e8270131c3bc21b72 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_8.txt @@ -0,0 +1 @@ +More details are available here concerning the installation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_9.txt b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..0845dd246318a701ac81701147ad5c3fa09dce06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neo/chunk_9.txt @@ -0,0 +1 @@ +Make sure as well to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7850d98407ad3aae425d312e2df82db2d4ac2c89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_10.txt @@ -0,0 +1,11 @@ +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +gen_tokens = model.generate( + input_ids, + do_sample=True, + temperature=0.9, + max_length=100, + ) +gen_text = tokenizer.batch_decode(gen_tokens)[0] + +Using Flash Attention 2 +Flash Attention 2 is an faster, optimized version of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..19b296df268eecde857dac9ddc583af7e670c0aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_11.txt @@ -0,0 +1,2 @@ +Installation +First, check whether your hardware is compatible with Flash Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..84c311a54146c988c8e82aaf553ec41c221356a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_12.txt @@ -0,0 +1 @@ +The latest list of compatible hardware can be found in the official documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_13.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff898854e5e7dd8569f851a9b33915004432a6fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_13.txt @@ -0,0 +1 @@ +If your hardware is not compatible with Flash Attention 2, you can still benefit from attention kernel optimisations through Better Transformer support covered above. 
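One hedged way to act on the advice above is to pick the attention backend at load time: use Flash Attention 2 when the GPU meets the flash-attn requirement (compute capability 8.0, i.e. Ampere or newer) and otherwise fall back to PyTorch's scaled-dot-product attention kernels, which recent transformers releases expose as attn_implementation="sdpa". The sdpa fallback is used here as a stand-in for the Better Transformer path mentioned above, not as the documented procedure:

```python
# Sketch: choose an attention backend based on the local GPU, assuming a recent
# transformers release where attn_implementation accepts "sdpa".
import torch
from transformers import GPTNeoXForCausalLM

def pick_attn_implementation():
    if torch.cuda.is_available():
        major, _ = torch.cuda.get_device_capability()
        if major >= 8:  # flash-attn requires Ampere or newer GPUs
            return "flash_attention_2"
    return "sdpa"  # fallback kernel when flash-attn is not supported

impl = pick_attn_implementation()
model = GPTNeoXForCausalLM.from_pretrained(
    "EleutherAI/gpt-neox-20b",
    torch_dtype=torch.float16,
    attn_implementation=impl,
)
print(f"loaded with attn_implementation={impl}")
```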
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_14.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aa642fdfc13ddb00fc660b8ed991a62e145d839 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_14.txt @@ -0,0 +1,5 @@ +Next, install the latest version of Flash Attention 2: + +pip install -U flash-attn --no-build-isolation +Usage +To load a model using Flash Attention 2, we can pass the argument attn_implementation="flash_attention_2" to .from_pretrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_15.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9846533b4b22f5676af8f062a2b4be3e9a79e214 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_15.txt @@ -0,0 +1 @@ +We'll also load the model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_16.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..09905cafac3e1f0430beb85eda043f57ef0385c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_16.txt @@ -0,0 +1,9 @@ +torch.float16), since it results in almost no degradation to audio quality but significantly lower memory usage and faster inference: +thon + +from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast + +model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to(device) + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using stockmark/gpt-neox-japanese-1.4b checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_17.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..85d690024f7788a20cb8663862056c1a5f2ab9b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_17.txt @@ -0,0 +1,23 @@ +Resources + +Causal language modeling task guide + +GPTNeoXConfig +[[autodoc]] GPTNeoXConfig +GPTNeoXTokenizerFast +[[autodoc]] GPTNeoXTokenizerFast +GPTNeoXModel +[[autodoc]] GPTNeoXModel + - forward +GPTNeoXForCausalLM +[[autodoc]] GPTNeoXForCausalLM + - forward +GPTNeoXForQuestionAnswering +[[autodoc]] GPTNeoXForQuestionAnswering + - forward +GPTNeoXForSequenceClassification +[[autodoc]] GPTNeoXForSequenceClassification + - forward +GPTNeoXForTokenClassification +[[autodoc]] GPTNeoXForTokenClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_7.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..93590e5d1342b8ea8fa76285ee2422eb1b2456f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_7.txt @@ -0,0 +1,2 @@ +The new tokenizer allocates +additional tokens to whitespace characters, making the model more suitable for certain tasks like code generation. 
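A quick way to see the whitespace behaviour described above is to tokenize an indented code line with both the GPT-NeoX and GPT-2 tokenizers and compare the pieces. Exact token counts depend on each vocabulary, so treat the output as illustrative; the GPT-NeoX tokenizer is expected to cover runs of spaces with fewer tokens:

```python
# Compare how the two tokenizers split leading whitespace in code.
from transformers import AutoTokenizer

line = "        return {'status': 'ok'}\n"
for name in ("EleutherAI/gpt-neox-20b", "openai-community/gpt2"):
    tok = AutoTokenizer.from_pretrained(name)
    ids = tok(line)["input_ids"]
    print(f"{name}: {len(ids)} tokens -> {tok.convert_ids_to_tokens(ids)}")
```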
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f330750e2a68af0699e9e41e89849354b161897 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_8.txt @@ -0,0 +1,2 @@ +Usage example +The generate() method can be used to generate text using GPT Neo model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_9.txt b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ba83e78fb643fa3f8ec2eb0e82891cccc474a9e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox/chunk_9.txt @@ -0,0 +1,6 @@ +thon + +from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast +model = GPTNeoXForCausalLM.from_pretrained("EleutherAI/gpt-neox-20b") +tokenizer = GPTNeoXTokenizerFast.from_pretrained("EleutherAI/gpt-neox-20b") +prompt = "GPTNeoX20B is a 20B-parameter autoregressive Transformer model developed by EleutherAI." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_3.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2483cd0c302dae661fe686068ab5d56f67f95fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_3.txt @@ -0,0 +1 @@ +We are very grateful to tanreinama for open-sourcing this incredibly helpful tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_4.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..8de3c03cfbb0993dbac3d8343c684eedf89246c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_4.txt @@ -0,0 +1 @@ +Following the recommendations from Google's research on PaLM, we have removed bias parameters from transformer blocks, achieving better model performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_5.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b96e3b1690f0a1f5f103245825520ff3c713c997 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_5.txt @@ -0,0 +1 @@ +Please refer this article in detail. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_6.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d15757e9cb546e805acc5787b30ca73a62c0927 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_6.txt @@ -0,0 +1 @@ +Development of the model was led by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori from ABEJA, Inc.. For more information on this model-building activity, please refer here (ja). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_7.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cbfe4d2525e6a676a895825d467b4932aed204c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_7.txt @@ -0,0 +1,2 @@ +Usage example +The generate() method can be used to generate text using GPT NeoX Japanese model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..42fccb956bc4ced501b45b0c880045a9ecf8da46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gpt_neox_japanese/chunk_8.txt @@ -0,0 +1,31 @@ +thon + +from transformers import GPTNeoXJapaneseForCausalLM, GPTNeoXJapaneseTokenizer +model = GPTNeoXJapaneseForCausalLM.from_pretrained("abeja/gpt-neox-japanese-2.7b") +tokenizer = GPTNeoXJapaneseTokenizer.from_pretrained("abeja/gpt-neox-japanese-2.7b") +prompt = "人とAIが協調するためには、" +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +gen_tokens = model.generate( + input_ids, + do_sample=True, + temperature=0.9, + max_length=100, + ) +gen_text = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)[0] +print(gen_text) +人とAIが協調するためには、AIと人が共存し、AIを正しく理解する必要があります。 + +Resources + +Causal language modeling task guide + +GPTNeoXJapaneseConfig +[[autodoc]] GPTNeoXJapaneseConfig +GPTNeoXJapaneseTokenizer +[[autodoc]] GPTNeoXJapaneseTokenizer +GPTNeoXJapaneseModel +[[autodoc]] GPTNeoXJapaneseModel + - forward +GPTNeoXJapaneseForCausalLM +[[autodoc]] GPTNeoXJapaneseForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a9d94056d573a4b80cbbfbbcb08b1af0b4d7421 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_10.txt @@ -0,0 +1 @@ +So it would need at least 4x model size GPU memory, even with mixed precision as gradient updates are in fp32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..26f923655656e44326f5639b74d171add55f97ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_11.txt @@ -0,0 +1,2 @@ +This + is not including the activations and data batches, which would again require some more GPU RAM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..abccf2445f791eb83802c1e2265a99c289a506dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_12.txt @@ -0,0 +1,2 @@ +So one should explore + solutions such as DeepSpeed, to train/fine-tune the model.
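Building on the DeepSpeed suggestion above, a minimal sketch of wiring a ZeRO stage-3 configuration into the Trainer follows; it assumes deepspeed is installed and that a fine-tuning setup (model, dataset) already exists, and the config values are illustrative rather than tuned:

```python
# Hedged sketch: ZeRO stage 3 with CPU offload so optimizer states and params
# do not have to fit on a single GPU.
from transformers import Trainer, TrainingArguments

ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,                              # shard params, grads and optimizer states
        "offload_optimizer": {"device": "cpu"},  # push optimizer states to CPU RAM
        "offload_param": {"device": "cpu"},
    },
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

args = TrainingArguments(
    output_dir="gptj-finetuned",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    fp16=True,
    deepspeed=ds_config,  # TrainingArguments also accepts a path to a JSON file here
)
# trainer = Trainer(model=model, args=args, train_dataset=train_dataset)
# trainer.train()
```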
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_13.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cbd24d35a7645ccdc746b3dd21ec02427b2b340 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_13.txt @@ -0,0 +1,2 @@ +Another option is to use the original codebase to + train/fine-tune the model on TPU and then convert the model to Transformers format for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_14.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b582dad4029a0c017dfbf4efbb011e7f1c0e6d7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_14.txt @@ -0,0 +1,4 @@ +Instructions for + that could be found here + +Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_15.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0248bba2bf1ca38da50be2ef172359208649bb50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_15.txt @@ -0,0 +1,2 @@ +These extra + tokens are added for the sake of efficiency on TPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_16.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..b61dc6ad9ee5f0b9140080d410d161c54a274b02 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_16.txt @@ -0,0 +1,3 @@ +To avoid the mismatch between embedding matrix size and vocab + size, the tokenizer for GPT-J contains 143 extra tokens + <|extratoken_1|> <|extratoken_143|>, so the vocab_size of tokenizer also becomes 50400. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_17.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b53195078401c2b2d308bfe28a9bff42367c89f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_17.txt @@ -0,0 +1,3 @@ +Usage examples +The [~generation.GenerationMixin.generate] method can be used to generate text using GPT-J +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_18.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..878df9c85fefa62519c2788997cc67def281a2c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_18.txt @@ -0,0 +1,8 @@ +thon + +from transformers import AutoModelForCausalLM, AutoTokenizer +model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B") +tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") +prompt = ( + "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " + "previously unexplored valley, in the Andes Mountains. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_19.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..dde93ae25f9bd4bbc964047f9c1ddf620316858b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_19.txt @@ -0,0 +1,2 @@ +Even more surprising to the " + "researchers was the fact that the unicorns spoke perfect English." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_20.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..848974669a7ad61f2d4bdeb468523ab4100d7c62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_20.txt @@ -0,0 +1,21 @@ +) +input_ids = tokenizer(prompt, return_tensors="pt").input_ids +gen_tokens = model.generate( + input_ids, + do_sample=True, + temperature=0.9, + max_length=100, + ) +gen_text = tokenizer.batch_decode(gen_tokens)[0] + +or in float16 precision: +thon + +from transformers import GPTJForCausalLM, AutoTokenizer +import torch +device = "cuda" +model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16).to(device) +tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B") +prompt = ( + "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " + "previously unexplored valley, in the Andes Mountains. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_21.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..dde93ae25f9bd4bbc964047f9c1ddf620316858b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_21.txt @@ -0,0 +1,2 @@ +Even more surprising to the " + "researchers was the fact that the unicorns spoke perfect English." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_22.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a54beaaa4ea6b3199a4f316ad4f9656d63846d00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_22.txt @@ -0,0 +1,12 @@ +) +input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device) +gen_tokens = model.generate( + input_ids, + do_sample=True, + temperature=0.9, + max_length=100, + ) +gen_text = tokenizer.batch_decode(gen_tokens)[0] + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GPT-J. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_23.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_23.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_24.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_25.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..215c6a0d0aadaca65547424dcd09afd289df7006 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_25.txt @@ -0,0 +1 @@ +Description of GPT-J. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_26.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..64f3314b143bf8dcdbe5b7304673dcc39d1481f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_26.txt @@ -0,0 +1 @@ +A blog on how to Deploy GPT-J 6B for inference using Hugging Face Transformers and Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_27.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb1819eed29e3d210101da0bdbd659c22819031b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_27.txt @@ -0,0 +1 @@ +A blog on how to Accelerate GPT-J inference with DeepSpeed-Inference on GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_28.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f98983647de92494920fb10102ad7973f2d9ae1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_28.txt @@ -0,0 +1 @@ +A blog post introducing GPT-J-6B: 6B JAX-Based Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_29.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8af5f5773cf2e74e16c59e23526478db678a4cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_29.txt @@ -0,0 +1,2 @@ +🌎 +A notebook for GPT-J-6B Inference Demo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_30.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..910dda30d482a90730f05ff871dd4212e80b33d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_30.txt @@ -0,0 +1,2 @@ +🌎 +Another notebook demonstrating Inference with GPT-J-6B. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_31.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..188d4e3ef86b3c7cb2d8fa95dfb0d6cef3918ced --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_31.txt @@ -0,0 +1 @@ +Causal language modeling chapter of the 🤗 Hugging Face Course. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_32.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..03f8bb3d14d5302f76fab57b8b2817f8916c2397 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_32.txt @@ -0,0 +1 @@ +[GPTJForCausalLM] is supported by this causal language modeling example script, text generation example script, and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_33.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f8da66b9e3df0719392ee035d217851113e6dea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_33.txt @@ -0,0 +1 @@ +[TFGPTJForCausalLM] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_34.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0af90e4d1be2ae9031517ac412250c322e4e38c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_34.txt @@ -0,0 +1 @@ +[FlaxGPTJForCausalLM] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptj/chunk_35.txt b/chunked/content_aware_chunking/model_doc_gptj/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..f23eab4c5594fd828eb1af782af88ff64521204e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptj/chunk_35.txt @@ -0,0 +1,40 @@ +Documentation resources +- Text classification task guide +- Question answering task guide +- Causal language modeling task guide +GPTJConfig +[[autodoc]] GPTJConfig + - all + +GPTJModel +[[autodoc]] GPTJModel + - forward +GPTJForCausalLM +[[autodoc]] GPTJForCausalLM + - forward +GPTJForSequenceClassification +[[autodoc]] GPTJForSequenceClassification + - forward +GPTJForQuestionAnswering +[[autodoc]] GPTJForQuestionAnswering + - forward + +TFGPTJModel +[[autodoc]] TFGPTJModel + - call +TFGPTJForCausalLM +[[autodoc]] TFGPTJForCausalLM + - call +TFGPTJForSequenceClassification +[[autodoc]] TFGPTJForSequenceClassification + - call +TFGPTJForQuestionAnswering +[[autodoc]] TFGPTJForQuestionAnswering + - call + +FlaxGPTJModel +[[autodoc]] FlaxGPTJModel + - call +FlaxGPTJForCausalLM +[[autodoc]] FlaxGPTJForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_10.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c52cc3581e211047b6ec5fd73d77966eadef2274 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_10.txt @@ -0,0 +1 @@ +Spout is pre-trained with random inputs, but you can specify a class of text or an arbitrary vector during fine-tuning. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_11.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab66d7e7b38589f777dabd9fa224a71d4aac7f8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_11.txt @@ -0,0 +1 @@ +This allows you to indicate the tendency of the generated text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_12.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6bc2454acec28ed844450ff7191967cbc50c368 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_12.txt @@ -0,0 +1 @@ +GPTSAN has a sparse Feed Forward based on Switch-Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_13.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cd7e1374c16d0e9f45873b9b3b3c6d0de0803a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_13.txt @@ -0,0 +1 @@ +You can also add other layers and train them partially. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_14.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4faf1014ca85da0fadc9b143597f43cc4f29acad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_14.txt @@ -0,0 +1 @@ +See the original GPTSAN repository for details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_15.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..e41fa8400246554c1d92529e53c958838d113380 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_15.txt @@ -0,0 +1,2 @@ +Prefix-LM Model +GPTSAN has the structure of the model named Prefix-LM in the T5 paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_16.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..229a057e6b00c38fd1309acf6a1fbc45f6572c09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_16.txt @@ -0,0 +1,2 @@ +(The original GPTSAN repository calls it hybrid) +In GPTSAN, the Prefix part of Prefix-LM, that is, the input position that can be referenced by both tokens, can be specified with any length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_17.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1281e872246328f5f491b48c9f37efb90e727319 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_17.txt @@ -0,0 +1 @@ +Arbitrary lengths can also be specified differently for each batch. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_18.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b892539e798f87796cc3da87ee2aa6fc4f4f13f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_18.txt @@ -0,0 +1 @@ +This length applies to the text entered in prefix_text for the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_19.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..95cdbebe8867bc32a9ad94eaf3997d263d02e416 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_19.txt @@ -0,0 +1 @@ +The tokenizer returns the mask of the Prefix part of Prefix-LM as token_type_ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_20.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b349668c4238478f5894a0c49833c21a38cf4e8e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_20.txt @@ -0,0 +1 @@ +The model treats the part where token_type_ids is 1 as a Prefix part, that is, the input can refer to both tokens before and after. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_21.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d85094aaebf74791d90888d6c65554e89e43eda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_21.txt @@ -0,0 +1,2 @@ +Usage tips +Specifying the Prefix part is done with a mask passed to self-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_22.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ff823306da1ab89d3d977f258131f1567431a98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_22.txt @@ -0,0 +1,36 @@ +When token_type_ids=None or all zero, it is equivalent to regular causal mask +for example: + +x_token = tokenizer("ｱｲｳｴ") +input_ids: | SOT | SEG | ｱ | ｲ | ｳ | ｴ | +token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 | +prefix_lm_mask: +SOT | 1 0 0 0 0 0 | +SEG | 1 1 0 0 0 0 | +ｱ | 1 1 1 0 0 0 | +ｲ | 1 1 1 1 0 0 | +ｳ | 1 1 1 1 1 0 | +ｴ | 1 1 1 1 1 1 | +x_token = tokenizer("", prefix_text="ｱｲｳｴ") +input_ids: | SOT | ｱ | ｲ | ｳ | ｴ | SEG | +token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 | +prefix_lm_mask: +SOT | 1 1 1 1 1 0 | +ｱ | 1 1 1 1 1 0 | +ｲ | 1 1 1 1 1 0 | +ｳ | 1 1 1 1 1 0 | +ｴ | 1 1 1 1 1 0 | +SEG | 1 1 1 1 1 1 | +x_token = tokenizer("ｳｴ", prefix_text="ｱｲ") +input_ids: | SOT | ｱ | ｲ | SEG | ｳ | ｴ | +token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 | +prefix_lm_mask: +SOT | 1 1 1 0 0 0 | +ｱ | 1 1 1 0 0 0 | +ｲ | 1 1 1 0 0 0 | +SEG | 1 1 1 1 0 0 | +ｳ | 1 1 1 1 1 0 | +ｴ | 1 1 1 1 1 1 | + +Spout Vector +A Spout Vector is a special vector for controlling text generation.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_23.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e311500075e3fd6b7d98f36f3a58c8580892f1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_23.txt @@ -0,0 +1 @@ +This vector is treated as the first embedding in self-attention to bring extraneous attention to the generated tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_24.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f9bc84fd8e05d3dfbb5fda6e95eef40c1f1b562 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_24.txt @@ -0,0 +1 @@ +In the pre-trained model published from Tanrei/GPTSAN-japanese, the Spout Vector is a 128-dimensional vector that passes through 8 fully connected layers in the model and is projected into the space acting as external attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_25.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4229a24cff2eddd02a9dd3b2bd7f5cd417a30a04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_25.txt @@ -0,0 +1 @@ +The Spout Vector projected by the fully connected layer is split to be passed to all self-attentions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_26.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce5cdde600b7c687c3d79524dff60bd126e39a5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_26.txt @@ -0,0 +1,9 @@ +GPTSanJapaneseConfig +[[autodoc]] GPTSanJapaneseConfig +GPTSanJapaneseTokenizer +[[autodoc]] GPTSanJapaneseTokenizer +GPTSanJapaneseModel +[[autodoc]] GPTSanJapaneseModel +GPTSanJapaneseForConditionalGeneration +[[autodoc]] GPTSanJapaneseForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_5.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..de7ff751c684060bbc7de6a2a515be86f9020c62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_5.txt @@ -0,0 +1,14 @@ +thon + +from transformers import AutoModel, AutoTokenizer +import torch +tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") +model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").cuda() +x_tok = tokenizer("は、", prefix_text="織田信長", return_tensors="pt") +torch.manual_seed(0) +gen_tok = model.generate(x_tok.input_ids.cuda(), token_type_ids=x_tok.token_type_ids.cuda(), max_new_tokens=20) +tokenizer.decode(gen_tok[0]) +'織田信長は、2004年に『戦国BASARA』のために、豊臣秀吉' + +GPTSAN Features +GPTSAN has some unique features.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_6.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..35bca5db44fa76c5f28896755cafa6f256e6b622 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_6.txt @@ -0,0 +1 @@ +It has a model structure of Prefix-LM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_7.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9a9901bfa13bf91e2e96e607836319170e5dc9f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_7.txt @@ -0,0 +1 @@ +It works as a shifted Masked Language Model for Prefix Input tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_8.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd97e5f39378dbd51064014d5fedd0e3b07aa693 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_8.txt @@ -0,0 +1 @@ +Un-prefixed inputs behave like normal generative models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_9.txt b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf6c78681c5093551de2b9d218f7d22b9c133a08 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_gptsan-japanese/chunk_9.txt @@ -0,0 +1 @@ +The Spout vector is a GPTSAN specific input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..879da52ad83294c6757bd1f435d3228a6192db46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by clefourrier. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d0f8f391f2e611c7e68af8f57b3bb990eb7a7f2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_12.txt @@ -0,0 +1,2 @@ +Usage tips +This model will not work well on large graphs (more than 100 nodes/edges), as it will make the memory explode. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab41b6c83bb93fc6ba72513adc71efcf3b3d0a3d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_13.txt @@ -0,0 +1 @@ +You can reduce the batch size, increase your RAM, or decrease the UNREACHABLE_NODE_DISTANCE parameter in algos_graphormer.pyx, but it will be hard to go above 700 nodes/edges. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..35f9f306b4290e7b73c8ab25853556b16263de24 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_14.txt @@ -0,0 +1 @@ +This model does not use a tokenizer, but instead a special collator during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fb45e76713d6bf8fd9cd921d5a63f4a6ada1ba1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_15.txt @@ -0,0 +1,8 @@ +GraphormerConfig +[[autodoc]] GraphormerConfig +GraphormerModel +[[autodoc]] GraphormerModel + - forward +GraphormerForGraphClassification +[[autodoc]] GraphormerForGraphClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3958346ec4256b52bcc6a89ab883db69d5d56e85 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_6.txt @@ -0,0 +1 @@ +In this paper, we solve this mystery by presenting Graphormer, which is built upon the standard Transformer architecture, and could attain excellent results on a broad range of graph representation learning tasks, especially on the recent OGB Large-Scale Challenge. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c20cc79530bdb1760b1aae98c58e6553d83d9a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_7.txt @@ -0,0 +1 @@ +Our key insight to utilizing Transformer in the graph is the necessity of effectively encoding the structural information of a graph into the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b585b3444d10b19cfe6287e99f0c55102aad2454 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_8.txt @@ -0,0 +1 @@ +To this end, we propose several simple yet effective structural encoding methods to help Graphormer better model graph-structured data. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_graphormer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_graphormer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..33ba1cdb3fa7b8142420ceb4c7754d782beea3b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_graphormer/chunk_9.txt @@ -0,0 +1 @@ +Besides, we mathematically characterize the expressive power of Graphormer and exhibit that with our ways of encoding the structural information of graphs, many popular GNN variants could be covered as the special cases of Graphormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_10.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..277b8dff270254a6444d37ef29d42f51fe38ff9e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_10.txt @@ -0,0 +1 @@ +The TensorFlow version was contributed by ariG23498 with the help of Yih-Dar SHIEH, Amy Roberts, and Joao Gante. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e03bca52c66aa94a85a75dbb3c91c56c6619f14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +You may specify output_segmentation=True in the forward of GroupViTModel to get the segmentation logits of input texts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6c486ecdbf1e89c01252bddd3fabbbad7626ed2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_13.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with GroupViT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..06b213c2cf57131461a958d3cd175ff8df43bdd9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_14.txt @@ -0,0 +1 @@ +The quickest way to get started with GroupViT is by checking the example notebooks (which showcase zero-shot segmentation inference). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d52a2e9e1f25ccff80c57b1ff30fad2906d1f55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_15.txt @@ -0,0 +1 @@ +One can also check out the HuggingFace Spaces demo to play with GroupViT. 
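As a hedged illustration of the output_segmentation=True tip above, the sketch below runs CLIP-style zero-shot image/text matching with GroupViTModel on the standard COCO test image used across the HF docs. The nvidia/groupvit-gcc-yfcc checkpoint name is taken from the published examples, and the segmentation_logits attribute name and shape are assumptions to verify against the output class of your installed transformers version:

```python
# Sketch: zero-shot matching with GroupViT, plus the optional segmentation output.
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, GroupViTModel

processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = ["a photo of a cat", "a photo of a dog"]

inputs = processor(text=texts, images=image, padding=True, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_segmentation=True)

probs = outputs.logits_per_image.softmax(dim=-1)  # image-text similarity scores
print(dict(zip(texts, probs[0].tolist())))
print(outputs.segmentation_logits.shape)          # assumed name/shape: (batch, num_texts, H, W)
```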
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..45a9db8f02a4c1e4aa8c658201c5992cc2740bd5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_16.txt @@ -0,0 +1,31 @@ +GroupViTConfig +[[autodoc]] GroupViTConfig + - from_text_vision_configs +GroupViTTextConfig +[[autodoc]] GroupViTTextConfig +GroupViTVisionConfig +[[autodoc]] GroupViTVisionConfig + +GroupViTModel +[[autodoc]] GroupViTModel + - forward + - get_text_features + - get_image_features +GroupViTTextModel +[[autodoc]] GroupViTTextModel + - forward +GroupViTVisionModel +[[autodoc]] GroupViTVisionModel + - forward + +TFGroupViTModel +[[autodoc]] TFGroupViTModel + - call + - get_text_features + - get_image_features +TFGroupViTTextModel +[[autodoc]] TFGroupViTTextModel + - call +TFGroupViTVisionModel +[[autodoc]] TFGroupViTVisionModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..782e99d14469414cce9eed77d2355edaf3f0463d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_8.txt @@ -0,0 +1 @@ +It achieves a zero-shot accuracy of 52.3% mIoU on the PASCAL VOC 2012 and 22.4% mIoU on PASCAL Context datasets, and performs competitively to state-of-the-art transfer-learning methods requiring greater levels of supervision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_groupvit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_groupvit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aea4448392f6588dd907077772aa58a01062575 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_groupvit/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by xvjiarui. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6faa9dfd84362a30a6f62462fbb32672d3157593 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_10.txt @@ -0,0 +1,3 @@ +Finally, we provide an +extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based +models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc219887924fdaabed66ef91ca132e925f8d5733 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by rmroczkowski. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_12.txt @@ -0,0 +1,2 @@ +The original code can be found +here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..435e87cee77b8b756f03f07512802875490d7c33 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_13.txt @@ -0,0 +1,7 @@ +Usage example +thon + +from transformers import HerbertTokenizer, RobertaModel +tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") +model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1") +encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..90060612c7edcae62329ac931196051d0cde1290 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_14.txt @@ -0,0 +1,9 @@ +", return_tensors="pt") +outputs = model(encoded_input) +HerBERT can also be loaded using AutoTokenizer and AutoModel: +import torch +from transformers import AutoModel, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1") +model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1") + +Herbert implementation is the same as BERT except for the tokenization method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f501071d10733655a46744d07afa2e0d9b086b74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_15.txt @@ -0,0 +1,2 @@ +Refer to BERT documentation +for API reference and examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6727607f8933a6158cdacd64d6e31e2f5fa57e98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_16.txt @@ -0,0 +1,4 @@ +HerbertTokenizer +[[autodoc]] HerbertTokenizer +HerbertTokenizerFast +[[autodoc]] HerbertTokenizerFast \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..099b087ba7cb31101e5d9c01601792db5f5280ab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_6.txt @@ -0,0 +1,2 @@ +It consists of a diverse set of tasks, adopted from existing +datasets for named entity recognition, question-answering, textual entailment, and others. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6dcb53804fa346a4c9d7c74c9a9a1eacebed5595 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_7.txt @@ -0,0 +1,2 @@ +We also introduce a new +sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR).
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e03f8f94182c8b5fa24a7f54dc7180251a7fb74c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_8.txt @@ -0,0 +1,3 @@ +To ensure a common evaluation scheme and +promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and +applications. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_herbert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_herbert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f88d0f197eaa024962bbb0213144416a9e2e2c27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_herbert/chunk_9.txt @@ -0,0 +1,2 @@ +Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language, +which has the best average performance and obtains the best results for three out of nine tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b81d5a2b7384b4851a7792b97229208d0a8491ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_10.txt @@ -0,0 +1,24 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +HubertConfig +[[autodoc]] HubertConfig + +HubertModel +[[autodoc]] HubertModel + - forward +HubertForCTC +[[autodoc]] HubertForCTC + - forward +HubertForSequenceClassification +[[autodoc]] HubertForSequenceClassification + - forward + +TFHubertModel +[[autodoc]] TFHubertModel + - call +TFHubertForCTC +[[autodoc]] TFHubertForCTC + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3051f7734ecf26c1a23cf51749705e7e5f89dae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_5.txt @@ -0,0 +1,4 @@ +Starting with a simple k-means +teacher of 100 clusters, and using two iterations of clustering, the HuBERT model either matches or improves upon the +state-of-the-art wav2vec 2.0 performance on the Librispeech (960h) and Libri-light (60,000h) benchmarks with 10min, 1h, +10h, 100h, and 960h fine-tuning subsets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cbffa053836016013471bd0b839bb298069e3c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_6.txt @@ -0,0 +1,2 @@ +Using a 1B parameter model, HuBERT shows up to 19% and 13% relative WER +reduction on the more challenging dev-other and test-other evaluation subsets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdcd6f521539b58c41bf3851bf659ef717d82cf1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +Hubert is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_hubert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_hubert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b959d1a98bc5088fd2f8b235619e80e5c1ce3b86 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_hubert/chunk_9.txt @@ -0,0 +1,2 @@ +Hubert model was fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8307003ae9913cda51f37d5827d5b1319e8275ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_10.txt @@ -0,0 +1,2 @@ +The framework has been developed in PyTorch and has +been open-sourced. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4557fbeb080a1e7ee17b67fa06dc19da65cd61b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by kssteven. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. 
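The HuBERT usage tips above (raw float waveform as input, CTC output decoded with a Wav2Vec2CTCTokenizer) can be tied together in a short, hedged sketch; the checkpoint and dummy dataset names are assumptions for illustration:

```python
# Hedged sketch of the CTC decoding flow described in the HuBERT usage tips.
import torch
from datasets import load_dataset
from transformers import HubertForCTC, Wav2Vec2Processor

# The processor bundles a feature extractor and a Wav2Vec2CTCTokenizer.
processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-large-ls960-ft")  # assumed checkpoint
model = HubertForCTC.from_pretrained("facebook/hubert-large-ls960-ft")

# One 16 kHz example; the model expects a raw float waveform array.
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# CTC: pick the most likely token per frame, then let the tokenizer collapse repeats/blanks.
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))
```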
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c438cf82f5429c1ddf68109ca8740edacb13fff7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_13.txt @@ -0,0 +1,28 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +IBertConfig +[[autodoc]] IBertConfig +IBertModel +[[autodoc]] IBertModel + - forward +IBertForMaskedLM +[[autodoc]] IBertForMaskedLM + - forward +IBertForSequenceClassification +[[autodoc]] IBertForSequenceClassification + - forward +IBertForMultipleChoice +[[autodoc]] IBertForMultipleChoice + - forward +IBertForTokenClassification +[[autodoc]] IBertForTokenClassification + - forward +IBertForQuestionAnswering +[[autodoc]] IBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..afe4dd987000b4ae348b3d16e068d314d5db74dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_5.txt @@ -0,0 +1,2 @@ +In this work, we propose I-BERT, a novel quantization scheme for Transformer based models that quantizes +the entire inference with integer-only arithmetic. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b0ec5cffbc93d1219f72d754eab51b1d21b9de0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_6.txt @@ -0,0 +1,3 @@ +Based on lightweight integer-only approximation methods for +nonlinear operations, e.g., GELU, Softmax, and Layer Normalization, I-BERT performs an end-to-end integer-only BERT +inference without any floating point calculation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..347b148ac27d5ee81d3fc56ead351c47dd9b113c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_7.txt @@ -0,0 +1,2 @@ +We evaluate our approach on GLUE downstream tasks using +RoBERTa-Base/Large. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2768b881953878b95f34ddba124db55fe7a4442 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_8.txt @@ -0,0 +1,2 @@ +We show that for both cases, I-BERT achieves similar (and slightly higher) accuracy as compared to +the full-precision baseline. 
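Since I-BERT is exposed through the RoBERTa-style classes listed in the autodoc block above, a basic forward pass looks like the sketch below. The checkpoint name and the quant_mode flag (full-precision fine-tuning vs. integer-only inference) reflect the released I-BERT artifacts as I understand them and should be treated as assumptions:

```python
# Hedged sketch: load a contributed I-BERT checkpoint and inspect its quantization mode.
import torch
from transformers import AutoTokenizer, IBertModel

model_id = "kssteven/ibert-roberta-base"  # assumed checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = IBertModel.from_pretrained(model_id)

# quant_mode=False runs the full-precision path; True enables the integer-only kernels.
print(model.config.quant_mode)

inputs = tokenizer("I-BERT quantizes the entire inference with integer-only arithmetic.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```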
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ibert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_ibert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d6778fd22a8b3cb85eba9c5ff69086a4507abff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ibert/chunk_9.txt @@ -0,0 +1,2 @@ +Furthermore, our preliminary implementation of I-BERT shows a speedup of 2.4 - 4.0x for +INT8 inference on a T4 GPU system as compared to FP32 inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_10.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3882988afd3039f57b53933a81d099a26a8a671 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_10.txt @@ -0,0 +1,16 @@ +To train a new IDEFICS model from scratch use the m4 codebase (a link will be provided once it's made public) + +IdeficsConfig +[[autodoc]] IdeficsConfig +IdeficsModel +[[autodoc]] IdeficsModel + - forward +IdeficsForVisionText2Text +[[autodoc]] IdeficsForVisionText2Text + - forward +IdeficsImageProcessor +[[autodoc]] IdeficsImageProcessor + - preprocess +IdeficsProcessor +[[autodoc]] IdeficsProcessor + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_5.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..abf7e6eb2271507d350b72c07470f01348ed3495 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_5.txt @@ -0,0 +1 @@ +We release the code to reproduce the dataset along with the dataset itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_6.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..164432a1949b6cc82ef8e7c03551e318e7b5c607 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by HuggingFaceM4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_7.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_7.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_8.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f639641a4a5eb1171c88a8860641bab3c2f39580 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_8.txt @@ -0,0 +1 @@ +(TODO: don't have a public link yet). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_idefics/chunk_9.txt b/chunked/content_aware_chunking/model_doc_idefics/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..52415a09b30cb97a0f28d4bbe6c51d053bf47f47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_idefics/chunk_9.txt @@ -0,0 +1 @@ +IDEFICS modeling code in Transformers is for finetuning and inferencing the pre-trained IDEFICS models. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..195e25de2263364f78fe4423f6f2e7215eb86fb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_11.txt @@ -0,0 +1,4 @@ +Usage tips + +ImageGPT is almost exactly the same as GPT-2, with the exception that a different activation + function is used (namely "quick gelu"), and the layer normalization layers don't mean center the inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b336c6970cc0b747ba366caf16bdb74730a2410 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_12.txt @@ -0,0 +1,2 @@ +ImageGPT + also doesn't have tied input- and output embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..036f1347022fd7e37ce9f10ad9f406c6093b3d5f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_13.txt @@ -0,0 +1,2 @@ +As the time- and memory requirements of the attention mechanism of Transformers scales quadratically in the sequence + length, the authors pre-trained ImageGPT on smaller input resolutions, such as 32x32 and 64x64. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d60978b65dfaf8ee9178b61bcc62ae10333f68 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_14.txt @@ -0,0 +1,2 @@ +However, feeding a + sequence of 32x32x3=3072 tokens from 0..255 into a Transformer is still prohibitively large. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f213b6ef3196be57adc712017ab677c057e6ffde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_15.txt @@ -0,0 +1,2 @@ +Therefore, the authors + applied k-means clustering to the (R,G,B) pixel values with k=512. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd0a2d2468330d373004eb02795c3a9f29929000 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_16.txt @@ -0,0 +1,2 @@ +This way, we only have a 32*32 = 1024-long + sequence, but now of integers in the range 0..511. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b8ccceb6d63c0bbb0d4a10f4be04c6943b3c5a0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_17.txt @@ -0,0 +1,2 @@ +So we are shrinking the sequence length at the cost of a bigger + embedding matrix. 
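The pixel-to-token mapping described above (32x32 resolution, each (R, G, B) pixel snapped to one of 512 color clusters, giving a 1024-token sequence) can be seen directly from the image processor. A minimal sketch, with the checkpoint name and image URL as assumptions:

```python
# Hedged sketch: ImageGPTImageProcessor turns an image into 1024 color-cluster token ids.
import requests
from PIL import Image
from transformers import ImageGPTImageProcessor

processor = ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small")  # assumed checkpoint
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed example image
image = Image.open(requests.get(url, stream=True).raw)

encoding = processor(images=image, return_tensors="pt")
# For the 32x32 checkpoints this is a (1, 1024) tensor of integers in [0, 511].
print(encoding.input_ids.shape)
print(encoding.input_ids.min().item(), encoding.input_ids.max().item())
```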
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd4d995dab917edd9a9e68a3a364d018976e70b6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_18.txt @@ -0,0 +1,2 @@ +In other words, the vocabulary size of ImageGPT is 512, + 1 for a special "start of sentence" (SOS) + token, used at the beginning of every sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8f28e41d5f5ab17eb36d1628a92bfc5205b7d8f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_19.txt @@ -0,0 +1,2 @@ +One can use [ImageGPTImageProcessor] to prepare + images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..7866613adb66e6be4d0352799b566f765d6b5001 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_20.txt @@ -0,0 +1 @@ +Despite being pre-trained entirely unsupervised (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_21.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..69a17c2f0cac2d9f7d9f35b85c7837f1e527c1d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_21.txt @@ -0,0 +1,2 @@ +without the use of any labels), ImageGPT produces fairly + performant image features useful for downstream tasks, such as image classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_22.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b9b08d9a3979389c6e68d0a1ba25536e097873e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_22.txt @@ -0,0 +1,3 @@ +The authors showed that the + features in the middle of the network are the most performant, and can be used as-is to train a linear model (such as + a sklearn logistic regression model for example). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_23.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..7511a6201138633ed12d3b3d4a72f66a3d829d2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_23.txt @@ -0,0 +1 @@ +This is also referred to as "linear probing". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_24.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b544b257ba1ce0aa6e9e62c38401c05ae6eefe47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_24.txt @@ -0,0 +1,3 @@ +Features can be + easily obtained by first forwarding the image through the model, then specifying output_hidden_states=True, and + then average-pool the hidden states at whatever layer you like. 
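The "linear probing" recipe described above (forward the image, set output_hidden_states=True, average-pool a middle layer, fit a linear classifier) might look like the following sketch; the checkpoint name, layer index, and the placeholder dataset names are assumptions:

```python
# Hedged sketch of linear probing on ImageGPT features.
import torch
from sklearn.linear_model import LogisticRegression
from transformers import ImageGPTImageProcessor, ImageGPTModel

processor = ImageGPTImageProcessor.from_pretrained("openai/imagegpt-small")  # assumed checkpoint
model = ImageGPTModel.from_pretrained("openai/imagegpt-small").eval()

def extract_features(pil_images, layer=10):
    inputs = processor(images=pil_images, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # hidden_states is a tuple of (batch, seq_len, hidden) tensors, one per layer;
    # average over the 1024 sequence positions of the chosen middle layer.
    return outputs.hidden_states[layer].mean(dim=1).numpy()

# train_images / train_labels are placeholders for a real labelled dataset:
# features = extract_features(train_images)
# clf = LogisticRegression(max_iter=1000).fit(features, train_labels)
```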
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_25.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b45f0ecf39c43ff33c12191195245b0c92e5d698 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_25.txt @@ -0,0 +1 @@ +Alternatively, one can further fine-tune the entire model on a downstream dataset, similar to BERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_26.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c2cdaeb532dbe55dc84494f551e0772a79aa62a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_26.txt @@ -0,0 +1,2 @@ +For this, you can + use [ImageGPTForImageClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_27.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac9dbccfd826de880c0313ff349d04f251064a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_27.txt @@ -0,0 +1 @@ +ImageGPT comes in different sizes: there's ImageGPT-small, ImageGPT-medium and ImageGPT-large. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_28.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b23c6839ef06658b2ba288d0c0392f4dd9377630 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_28.txt @@ -0,0 +1,2 @@ +The authors did also + train an XL variant, which they didn't release. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_29.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..274a15863cfa44e699723a5129031d33ba20eb42 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_29.txt @@ -0,0 +1,12 @@ +The differences in size are summarized in the following table: + +| Model variant | Depths | Hidden sizes | Decoder hidden size | Params (M) | ImageNet-1k Top 1 | +|---|---|---|---|---|---| +| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 | +| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 | +| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 | +| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 | +| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 | +| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 | +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ImageGPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_30.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a772e100d6cfa388a90324edddeb3d44499b521 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_30.txt @@ -0,0 +1 @@ +Demo notebooks for ImageGPT can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_31.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ded41d03a2413e331d1262785fcbf6839142505 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_31.txt @@ -0,0 +1 @@ +[ImageGPTForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_32.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_32.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_33.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_33.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_imagegpt/chunk_34.txt b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f3d33abdc9768ad51ead8389f8d3cc1a2c73d2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_imagegpt/chunk_34.txt @@ -0,0 +1,17 @@ +ImageGPTConfig +[[autodoc]] ImageGPTConfig +ImageGPTFeatureExtractor +[[autodoc]] ImageGPTFeatureExtractor + - call +ImageGPTImageProcessor +[[autodoc]] ImageGPTImageProcessor + - preprocess +ImageGPTModel +[[autodoc]] ImageGPTModel + - forward +ImageGPTForCausalImageModeling +[[autodoc]] ImageGPTForCausalImageModeling + - forward +ImageGPTForImageClassification +[[autodoc]] ImageGPTForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd239efa357b6b66d24250ed6905454a55081258 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by elisim and kashif. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..da320e00afa0cc964a5a7ef63dbffe0646db1773 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_12.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_13.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_14.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..78a80e97e597487584543c46d04ea9bc1dd6c3cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_15.txt @@ -0,0 +1,10 @@ +Check out the Informer blog-post in HuggingFace blog: Multivariate Probabilistic Time Series Forecasting with Informer + +InformerConfig +[[autodoc]] InformerConfig +InformerModel +[[autodoc]] InformerModel + - forward +InformerForPrediction +[[autodoc]] InformerForPrediction + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef410a909bf8314ad7e6d32480e5161be189b080 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_8.txt @@ -0,0 +1 @@ +(iii) the generative style decoder, while conceptually simple, predicts the long time-series sequences at one forward operation rather than a step-by-step way, which drastically improves the inference speed of long-sequence predictions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_informer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_informer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..62395e8efc430f73fff268fa6eb8f7dc9b0f9164 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_informer/chunk_9.txt @@ -0,0 +1 @@ +Extensive experiments on four large-scale datasets demonstrate that Informer significantly outperforms existing methods and provides a new solution to the LSTF problem. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceb052887d8aa2e0582a92a69b54da484ea1be68 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_10.txt @@ -0,0 +1 @@ +Furthermore, we qualitatively demonstrate the advantages of InstructBLIP over concurrent multimodal models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c076df3441481b301281f4cc646645b902ba410a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_11.txt @@ -0,0 +1 @@ +InstructBLIP architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_12.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_13.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_14.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_15.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ead3cf75e6f7061906d677eb9b14849849d3ffbf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_15.txt @@ -0,0 +1,2 @@ +Usage tips +InstructBLIP uses the same architecture as BLIP-2 with a tiny but important difference: it also feeds the text prompt (instruction) to the Q-Former. 
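The InstructBLIP usage tip above (the instruction is also fed to the Q-Former) is handled internally; from the user's side, inference is a single processor call plus generate. A minimal sketch, with the checkpoint name and image URL as assumptions:

```python
# Hedged sketch of InstructBLIP visual instruction-following inference.
import requests
import torch
from PIL import Image
from transformers import InstructBlipProcessor, InstructBlipForConditionalGeneration

model_id = "Salesforce/instructblip-vicuna-7b"  # assumed checkpoint
processor = InstructBlipProcessor.from_pretrained(model_id)
model = InstructBlipForConditionalGeneration.from_pretrained(model_id)

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"  # assumed image
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
prompt = "What is unusual about this image?"

# The same text prompt is routed both to the Q-Former and to the language model.
inputs = processor(images=image, text=prompt, return_tensors="pt")
with torch.no_grad():
    generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
```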
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_16.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..72411760507068c4a9dc3f38429cbdcac499a60d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_16.txt @@ -0,0 +1,19 @@ +InstructBlipConfig +[[autodoc]] InstructBlipConfig + - from_vision_qformer_text_configs +InstructBlipVisionConfig +[[autodoc]] InstructBlipVisionConfig +InstructBlipQFormerConfig +[[autodoc]] InstructBlipQFormerConfig +InstructBlipProcessor +[[autodoc]] InstructBlipProcessor +InstructBlipVisionModel +[[autodoc]] InstructBlipVisionModel + - forward +InstructBlipQFormerModel +[[autodoc]] InstructBlipQFormerModel + - forward +InstructBlipForConditionalGeneration +[[autodoc]] InstructBlipForConditionalGeneration + - forward + - generate \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_6.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eefcccdfe66644cdc2e9c577c2255b1582c46c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_6.txt @@ -0,0 +1 @@ +We gather a wide variety of 26 publicly available datasets, transform them into instruction tuning format and categorize them into two clusters for held-in instruction tuning and held-out zero-shot evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_7.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e3deae6be3354cdf64b87112956f736a6c77f8c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_7.txt @@ -0,0 +1 @@ +Additionally, we introduce instruction-aware visual feature extraction, a crucial method that enables the model to extract informative features tailored to the given instruction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..323ca569fd8f522c4cc6e6e8e5ea34be9a6f1638 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_8.txt @@ -0,0 +1 @@ +The resulting InstructBLIP models achieve state-of-the-art zero-shot performance across all 13 held-out datasets, substantially outperforming BLIP-2 and the larger Flamingo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_instructblip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_instructblip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a6ee0c53332d4a8bf82de39d001a53a0e07c7e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_instructblip/chunk_9.txt @@ -0,0 +1 @@ +Our models also lead to state-of-the-art performance when finetuned on individual downstream tasks (e.g., 90.7% accuracy on ScienceQA IMG). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_10.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2028041bb32151faa4e9285bb1784afe0d358b6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_10.txt @@ -0,0 +1 @@ +Next, the first (also called top_prior) prior attends to the last hidden states extracted from the lyrics encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_11.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..68ced71c609036428f8704920dc65858f59d8d68 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_11.txt @@ -0,0 +1 @@ +The priors are linked to the previous priors respectively via an AudioConditioner module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_12.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb018045b3befbcc64f297c308f6abbebef07f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_12.txt @@ -0,0 +1 @@ +The AudioConditioner upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_13.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..418fbdc195cbc52d0f6bc39bcf04bc13dfdad91d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_13.txt @@ -0,0 +1 @@ +The metadata such as artist, genre and timing are passed to each prior, in the form of a start token and positional embedding for the timing data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_14.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6844c03e457b2ad9c69388523ab205f2995f5a83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_14.txt @@ -0,0 +1 @@ +The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_15.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab9617ef2517f302473dd89cd52cc9b95ae1cba6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by Arthur Zucker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_16.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_17.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..306da2f1915e226b48bc482826199e7ed2028f12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_17.txt @@ -0,0 +1,3 @@ +Usage tips + +This model only supports inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_18.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..acdb553864a5160a021349b5fe68ef5b6712e794 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_18.txt @@ -0,0 +1 @@ +This is for a few reasons, mostly because it requires a crazy amount of memory to train. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_19.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b163ec4b56d81202d06ff5c76b6eb0348b2f1cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_19.txt @@ -0,0 +1 @@ +Feel free to open a PR and add what's missing to have a full integration with the Hugging Face Trainer! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_20.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..51b0fa55288fa3a8d733d990239a55029fbd20e5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_20.txt @@ -0,0 +1 @@ +This model is very slow, and takes 8h to generate a minute-long audio clip using the 5b top prior on a V100 GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_21.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3aada9b6733324832d87b674b1ae6ec0ccd5a82 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_21.txt @@ -0,0 +1 @@ +In order to automatically handle the device on which the model should execute, use accelerate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_22.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..aae4bb30c7fcc8483ebb42a7f03486f03e0b679a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_22.txt @@ -0,0 +1 @@ +Contrary to the paper, the order of the priors goes from 0 to 1 as it felt more intuitive: we sample starting from 0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_23.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a8c09e84282f7dce3d835915e52b980ef558439 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_23.txt @@ -0,0 +1 @@ +Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with fp16 set to True.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_24.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab9617ef2517f302473dd89cd52cc9b95ae1cba6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_24.txt @@ -0,0 +1 @@ +This model was contributed by Arthur Zucker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_25.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_25.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_26.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd3471ff83aa868232854a7e097ca329914404d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_26.txt @@ -0,0 +1,25 @@ +JukeboxConfig +[[autodoc]] JukeboxConfig +JukeboxPriorConfig +[[autodoc]] JukeboxPriorConfig +JukeboxVQVAEConfig +[[autodoc]] JukeboxVQVAEConfig +JukeboxTokenizer +[[autodoc]] JukeboxTokenizer + - save_vocabulary +JukeboxModel +[[autodoc]] JukeboxModel + - ancestral_sample + - primed_sample + - continue_sample + - upsample + - _sample +JukeboxPrior +[[autodoc]] JukeboxPrior + - sample + - forward +JukeboxVQVAE +[[autodoc]] JukeboxVQVAE + - forward + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_7.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd7de0bc9dd8ff8b1b6b3ed01c57fe402e106fa4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_7.txt @@ -0,0 +1 @@ +As shown in the following figure, Jukebox is made of 3 priors which are decoder-only models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_8.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a14b2b5d915828c0a1ecf3ef59d4ed9df24dac98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_8.txt @@ -0,0 +1 @@ +They follow the architecture described in Generating Long Sequences with Sparse Transformers, modified to support longer context length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_jukebox/chunk_9.txt b/chunked/content_aware_chunking/model_doc_jukebox/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e834303e7d57613f8538708b36fdf276091439d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_jukebox/chunk_9.txt @@ -0,0 +1 @@ +First, an autoencoder is used to encode the text lyrics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b449718bcd93355b6134978742897fffb50a5f4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_10.txt @@ -0,0 +1 @@ +Code and pretrained models are available at https://aka.ms/kosmos-2.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4e99b51da5583723b72b8e09c7ae9d2f6f750f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_11.txt @@ -0,0 +1 @@ +Overview of tasks that KOSMOS-2 can handle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b6da15c686383988742ca5b46fb3bd5e63b002c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_13.txt @@ -0,0 +1,25 @@ +Example +thon + +from PIL import Image +import requests +from transformers import AutoProcessor, Kosmos2ForConditionalGeneration +model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224") +processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224") +url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg" +image = Image.open(requests.get(url, stream=True).raw) +prompt = " An image of" +inputs = processor(text=prompt, images=image, return_tensors="pt") +generated_ids = model.generate( + pixel_values=inputs["pixel_values"], + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + image_embeds=None, + image_embeds_position_mask=inputs["image_embeds_position_mask"], + use_cache=True, + max_new_tokens=64, + ) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False) +processed_text +' An image of a snowman warming himself by a fire.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..42408ce8f85a9776b1974cabc08dca9383b5d320 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_14.txt @@ -0,0 +1,3 @@ +caption, entities = processor.post_process_generation(generated_text) +caption +'An image of a snowman warming himself by a fire.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05036e9318419caa4d61a90a0aa89a031329d14b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_15.txt @@ -0,0 +1,4 @@ +entities +[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])] + +This model was contributed by Yih-Dar SHIEH. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..dae121945d4ef462e21030bafc987ddc4ea93495 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_17.txt @@ -0,0 +1,12 @@ +Kosmos2Config +[[autodoc]] Kosmos2Config +Kosmos2ImageProcessor +Kosmos2Processor +[[autodoc]] Kosmos2Processor + - call +Kosmos2Model +[[autodoc]] Kosmos2Model + - forward +Kosmos2ForConditionalGeneration +[[autodoc]] Kosmos2ForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a450f6e9e10c79e2c9411f961366e67a7fac64bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_8.txt @@ -0,0 +1 @@ +We evaluate Kosmos-2 on a wide range of tasks, including (i) multimodal grounding, such as referring expression comprehension, and phrase grounding, (ii) multimodal referring, such as referring expression generation, (iii) perception-language tasks, and (iv) language understanding and generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b65354bdc1b9b01312d5520eca3aed3a7bc07f78 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_kosmos-2/chunk_9.txt @@ -0,0 +1 @@ +This work lays out the foundation for the development of Embodiment AI and sheds light on the big convergence of language, multimodal perception, action, and world modeling, which is a key step toward artificial general intelligence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c17ccb5c78e5a97b71069201e87523c85b62c31b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_10.txt @@ -0,0 +1,3 @@ +It achieves new state-of-the-art results in several downstream tasks, including form +understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification +(from 93.07 to 94.42). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_11.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6203b9e020d8454059d893071bb589cc4811613 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_11.txt @@ -0,0 +1,4 @@ +Usage tips + +In addition to input_ids, [~transformers.LayoutLMModel.forward] also expects the input bbox, which are + the bounding boxes (i.e. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_12.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf8934c4efb247984fc3e12e02240aa089926d1f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_12.txt @@ -0,0 +1 @@ +2D-positions) of the input tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_13.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a27c29fce78d28bfb0988b999c3555faa4252b80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_13.txt @@ -0,0 +1,2 @@ +These can be obtained using an external OCR engine such + as Google's Tesseract (there's a Python wrapper available). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_14.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c29504975c8c8f2bf9701a89055ce38bd39cd5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_14.txt @@ -0,0 +1,3 @@ +Each bounding box should be in (x0, y0, x1, y1) format, where + (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1) represents the + position of the lower right corner. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_15.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ebeba670d33c9fdaec64a27aff6c3aa367d7cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_15.txt @@ -0,0 +1,2 @@ +Note that one first needs to normalize the bounding boxes to be on a 0-1000 + scale. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_16.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..df0b2510ec6976f2f6d85cff183e7e3de760d47c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_16.txt @@ -0,0 +1,12 @@ +To normalize, you can use the following function: + +python +def normalize_bbox(bbox, width, height): + return [ + int(1000 * (bbox[0] / width)), + int(1000 * (bbox[1] / height)), + int(1000 * (bbox[2] / width)), + int(1000 * (bbox[3] / height)), + ] +Here, width and height correspond to the width and height of the original document in which the token +occurs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_17.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a2104abafb894f9ed7611628228990a004e59f7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_17.txt @@ -0,0 +1,4 @@ +Those can be obtained using the Python Image Library (PIL) library for example, as follows: +thon +from PIL import Image +Document can be a png, jpg, etc. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_18.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..c257b88eaa58cde21e85fca4ccd24371277af3b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_18.txt @@ -0,0 +1 @@ +PDFs must be converted to images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_19.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..202c81d87fcf921b8901f775f0a5f24b0ac735ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_19.txt @@ -0,0 +1,5 @@ +image = Image.open(name_of_your_document).convert("RGB") +width, height = image.size + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_20.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_20.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_21.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_22.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5494ef88c6e684529dd7af866caebe6eeaa734b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_22.txt @@ -0,0 +1,3 @@ +A blog post on fine-tuning + LayoutLM for document-understanding using Keras & Hugging Face + Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_23.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc8f2c9c3cd18b36432a1a44ede35e23a6b5e651 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_23.txt @@ -0,0 +1 @@ +A blog post on how to fine-tune LayoutLM for document-understanding using only Hugging Face Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_24.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdd42113a5debe1bff4c70c761c374afbb0ea1de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_24.txt @@ -0,0 +1 @@ +A notebook on how to fine-tune LayoutLM on the FUNSD dataset with image embeddings. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_25.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..68d0f6085175184bf427817b906b7a74143ce82d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_25.txt @@ -0,0 +1,3 @@ +See also: Document question answering task guide + +A notebook on how to fine-tune LayoutLM for sequence classification on the RVL-CDIP dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_26.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..37777e1e83fbeb5b8f9f39f620f8e5fa388a889c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_26.txt @@ -0,0 +1,3 @@ +Text classification task guide + +A notebook on how to fine-tune LayoutLM for token classification on the FUNSD dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_27.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1661e1129da74b06b06714ccc5188bf08cedc47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_27.txt @@ -0,0 +1,7 @@ +Token classification task guide + +Other resources +- Masked language modeling task guide +🚀 Deploy + +A blog post on how to Deploy LayoutLM with Hugging Face Inference Endpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_28.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d0140a668e3f527633578f0ad473d25dc83375a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_28.txt @@ -0,0 +1,28 @@ +LayoutLMConfig +[[autodoc]] LayoutLMConfig +LayoutLMTokenizer +[[autodoc]] LayoutLMTokenizer +LayoutLMTokenizerFast +[[autodoc]] LayoutLMTokenizerFast + +LayoutLMModel +[[autodoc]] LayoutLMModel +LayoutLMForMaskedLM +[[autodoc]] LayoutLMForMaskedLM +LayoutLMForSequenceClassification +[[autodoc]] LayoutLMForSequenceClassification +LayoutLMForTokenClassification +[[autodoc]] LayoutLMForTokenClassification +LayoutLMForQuestionAnswering +[[autodoc]] LayoutLMForQuestionAnswering + +TFLayoutLMModel +[[autodoc]] TFLayoutLMModel +TFLayoutLMForMaskedLM +[[autodoc]] TFLayoutLMForMaskedLM +TFLayoutLMForSequenceClassification +[[autodoc]] TFLayoutLMForSequenceClassification +TFLayoutLMForTokenClassification +[[autodoc]] TFLayoutLMForTokenClassification +TFLayoutLMForQuestionAnswering +[[autodoc]] TFLayoutLMForQuestionAnswering \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..147c51d0fada6433a5e466fb608c565d9e6890ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlm/chunk_9.txt @@ -0,0 +1,2 @@ +To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for +document-level pretraining. 
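Putting the LayoutLM usage tips above together, the following is a minimal sketch of a forward pass through [LayoutLMModel]. The words and the already-normalized boxes are made up and would normally come from an OCR engine; each word-level box is repeated for every wordpiece of that word.

```python
import torch
from transformers import LayoutLMTokenizer, LayoutLMModel

tokenizer = LayoutLMTokenizer.from_pretrained("microsoft/layoutlm-base-uncased")
model = LayoutLMModel.from_pretrained("microsoft/layoutlm-base-uncased")

words = ["Hello", "world"]                            # made-up OCR output
boxes = [[637, 773, 693, 782], [698, 773, 733, 782]]  # already normalized to the 0-1000 scale

# repeat each word-level box for every wordpiece of that word
token_boxes = []
for word, box in zip(words, boxes):
    token_boxes.extend([box] * len(tokenizer.tokenize(word)))
# add bounding boxes for the special [CLS] and [SEP] tokens
token_boxes = [[0, 0, 0, 0]] + token_boxes + [[1000, 1000, 1000, 1000]]

encoding = tokenizer(" ".join(words), return_tensors="pt")
bbox = torch.tensor([token_boxes])

outputs = model(
    input_ids=encoding["input_ids"],
    attention_mask=encoding["attention_mask"],
    token_type_ids=encoding["token_type_ids"],
    bbox=bbox,
)
print(outputs.last_hidden_state.shape)
```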
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_32.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f3b0372d6ebcc4f49319ad56e82ca2b3fb183f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_32.txt @@ -0,0 +1,6 @@ +Those can be obtained using the Python Image Library (PIL) library for example, as +follows: +thon +from PIL import Image +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_33.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_33.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_34.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..8532f6daa24fc774d15cf76f90039e1d676be0fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_34.txt @@ -0,0 +1,5 @@ +) +width, height = image.size + +However, this model includes a brand new [~transformers.LayoutLMv2Processor] which can be used to directly +prepare data for the model (including applying OCR under the hood). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_35.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd94bbd679ed0b28ddc54fb7574a55c6c5e8dd14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_35.txt @@ -0,0 +1,2 @@ +More information can be found in the "Usage" +section below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_36.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..2817630bc85e1ccfc2cc553224b7e1db24186adf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_36.txt @@ -0,0 +1,3 @@ +Internally, [~transformers.LayoutLMv2Model] will send the image input through its visual backbone to + obtain a lower-resolution feature map, whose shape is equal to the image_feature_pool_shape attribute of + [~transformers.LayoutLMv2Config]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_37.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..c39efdbb9a94a3a00dbde790a0942967a95f13b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_37.txt @@ -0,0 +1 @@ +This feature map is then flattened to obtain a sequence of image tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_38.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cef9c83f6b28e518b088f30d93cf23afc071f67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_38.txt @@ -0,0 +1,2 @@ +As + the size of the feature map is 7x7 by default, one obtains 49 image tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_39.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f366b7393368e139a842f0e7f19eb9a16ffd96c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_39.txt @@ -0,0 +1,2 @@ +These are then concatenated with the text + tokens, and send through the Transformer encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_40.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..b010f30a20ebc758a9ffe88280a67208279c93c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_40.txt @@ -0,0 +1,2 @@ +This means that the last hidden states of the model will have a + length of 512 + 49 = 561, if you pad the text tokens up to the max length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_41.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f00b58e23d91b7703904c7d6c5f74935faaec63 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_41.txt @@ -0,0 +1,3 @@ +More generally, the last hidden states + will have a shape of seq_length + image_feature_pool_shape[0] * + config.image_feature_pool_shape[1]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_42.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa1e1ce02ad93b47103fe86ecc38abd61b919c95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_42.txt @@ -0,0 +1,2 @@ +When calling [~transformers.LayoutLMv2Model.from_pretrained], a warning will be printed with a long list of + parameter names that are not initialized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_43.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c6a513a39d50d549e170849fba564d759a92b9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_43.txt @@ -0,0 +1,2 @@ +This is not a problem, as these parameters are batch normalization + statistics, which are going to have values when fine-tuning on a custom dataset. 
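The sequence-length arithmetic above can be checked directly from the configuration. A small sketch, assuming the default image_feature_pool_shape of [7, 7, 256] and text inputs padded to 512 tokens:

```python
from transformers import LayoutLMv2Config

config = LayoutLMv2Config()  # image_feature_pool_shape defaults to [7, 7, 256]

text_seq_length = 512  # text tokens padded up to the max length
visual_seq_length = config.image_feature_pool_shape[0] * config.image_feature_pool_shape[1]

print(visual_seq_length)                    # 49 image tokens
print(text_seq_length + visual_seq_length)  # 561 = length of the last hidden states
```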
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_44.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..189e930ca3a0977c2d6edd8b6fd3941a356fb64e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_44.txt @@ -0,0 +1,2 @@ +If you want to train the model in a distributed environment, make sure to call [synchronize_batch_norm] on the + model in order to properly synchronize the batch normalization layers of the visual backbone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_45.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..148f14cf61514205198502d8a3febe93269ac44a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_45.txt @@ -0,0 +1 @@ +In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_46.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..372c70bb86a288aef8fd6acfba4d10e8fa1493c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_46.txt @@ -0,0 +1,2 @@ +More information can be found on +LayoutXLM's documentation page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_47.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..16443609e4d835942b51b57179f934eb062ab206 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_47.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_48.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_48.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_49.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_49.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_50.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..3400d56f6aa24e36413817ce3fd5f80073b47a8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_50.txt @@ -0,0 +1 @@ +A notebook on how to finetune LayoutLMv2 for text-classification on RVL-CDIP dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_51.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d79fd8fa295bd6b691bd44836e109019a1efe40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_51.txt @@ -0,0 +1,3 @@ +See also: Text classification task guide + +A notebook on how to finetune LayoutLMv2 for question-answering on DocVQA dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_52.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..552118b6aabaa16e8edad5a29185a1a95a14bdbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_52.txt @@ -0,0 +1,4 @@ +See also: Question answering task guide +See also: Document question answering task guide + +A notebook on how to finetune LayoutLMv2 for token-classification on CORD dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_53.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..d962f311717e4f264523181a0e1e7e8d83f239fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_53.txt @@ -0,0 +1 @@ +A notebook on how to finetune LayoutLMv2 for token-classification on FUNSD dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_54.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..221c6325cdb7d84e32d7aeebd64eeb1f7c4c8360 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_54.txt @@ -0,0 +1,6 @@ +See also: Token classification task guide + +Usage: LayoutLMv2Processor +The easiest way to prepare data for the model is to use [LayoutLMv2Processor], which internally +combines a image processor ([LayoutLMv2ImageProcessor]) and a tokenizer +([LayoutLMv2Tokenizer] or [LayoutLMv2TokenizerFast]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_55.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4a14b8f50414cc4c27ed5ef0fb27d6d8ee4301 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_55.txt @@ -0,0 +1,2 @@ +The image processor +handles the image modality, while the tokenizer handles the text modality. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_56.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d247aafddd5fc5ca867fdf5e8f4cdb29f515f75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_56.txt @@ -0,0 +1,2 @@ +A processor combines both, which is ideal +for a multi-modal model like LayoutLMv2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_57.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f7655aafe2bbb9050f85aabc8e477c11aab3645 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_57.txt @@ -0,0 +1,2 @@ +Note that you can still use both separately, if you only want to handle one +modality. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_58.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..af2140beaec94165ec430891b6a199b19340f46c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_58.txt @@ -0,0 +1,8 @@ +thon +from transformers import LayoutLMv2ImageProcessor, LayoutLMv2TokenizerFast, LayoutLMv2Processor +image_processor = LayoutLMv2ImageProcessor() # apply_ocr is set to True by default +tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased") +processor = LayoutLMv2Processor(image_processor, tokenizer) + +In short, one can provide a document image (and possibly additional data) to [LayoutLMv2Processor], +and it will create the inputs expected by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_59.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..935c64c8ab6049430e36020f7ae3a9d7bc59c0bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_59.txt @@ -0,0 +1,3 @@ +Internally, the processor first uses +[LayoutLMv2ImageProcessor] to apply OCR on the image to get a list of words and normalized +bounding boxes, as well to resize the image to a given size in order to get the image input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_60.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..6911a850b8b62de1c141bc9de52317d6406df504 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_60.txt @@ -0,0 +1,4 @@ +The words and +normalized bounding boxes are then provided to [LayoutLMv2Tokenizer] or +[LayoutLMv2TokenizerFast], which converts them to token-level input_ids, +attention_mask, token_type_ids, bbox. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_61.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8d910efbe9f772424dc8f5b29f5c495496538fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_61.txt @@ -0,0 +1,2 @@ +Optionally, one can provide word labels to the processor, +which are turned into token-level labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_62.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c8f63140236453ab4d54b86d55f511a498294ec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_62.txt @@ -0,0 +1,2 @@ +[LayoutLMv2Processor] uses PyTesseract, a Python +wrapper around Google's Tesseract OCR engine, under the hood. 
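Once a processor is available, its output can be fed straight into a LayoutLMv2 model. Before the individual use cases below, here is a minimal, hypothetical sketch for document image classification; the file name and the number of labels are made up, LayoutLMv2's visual backbone additionally requires detectron2, and the default OCR path requires pytesseract.

```python
import torch
from PIL import Image
from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification

processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
model = LayoutLMv2ForSequenceClassification.from_pretrained(
    "microsoft/layoutlmv2-base-uncased", num_labels=2  # num_labels is made up for the example
)

image = Image.open("document.png").convert("RGB")  # hypothetical file name
encoding = processor(image, return_tensors="pt")   # OCR is applied under the hood

with torch.no_grad():
    outputs = model(**encoding)
print(outputs.logits.shape)  # (1, num_labels)
```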
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_63.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd03b2332c99388cebed3fcd9cb0323ce75771e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_63.txt @@ -0,0 +1,2 @@ +Note that you can still use your own OCR engine of +choice, and provide the words and normalized boxes yourself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_64.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..4328b1d482d166acca69dff60457b18ebf735d75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_64.txt @@ -0,0 +1,2 @@ +This requires initializing +[LayoutLMv2ImageProcessor] with apply_ocr set to False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_65.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b5541f8bd7f5eb0180a23f5163bcdef407e3570 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_65.txt @@ -0,0 +1 @@ +In total, there are 5 use cases that are supported by the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_66.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..e615b20e64512fd039f3ae01c2c2b77df0e22068 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_66.txt @@ -0,0 +1 @@ +Below, we list them all. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_67.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..b096c20bea5ea96d175a4179b378b9e51468de66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_67.txt @@ -0,0 +1,2 @@ +Note that each of these +use cases work for both batched and non-batched inputs (we illustrate them for non-batched inputs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_68.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbd50d168cc18584449c013da3f763c3dd79f414 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_68.txt @@ -0,0 +1,4 @@ +Use case 1: document image classification (training, inference) + token classification (inference), apply_ocr = +True +This is the simplest case, in which the processor (actually the image processor) will perform OCR on the image to get +the words and normalized bounding boxes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_69.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0b43ccfcb31412e4a6cfd8280fac6af6c20fb2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_69.txt @@ -0,0 +1,6 @@ +thon +from transformers import LayoutLMv2Processor +from PIL import Image +processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased") +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_70.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_70.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_71.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..84a710c6d4a79dcb0db28b377f06fd0df284bb38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_71.txt @@ -0,0 +1,10 @@ +).convert("RGB") +encoding = processor( + image, return_tensors="pt" +) # you can also add all tokenizer parameters here such as padding, truncation +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) + +Use case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False +In case one wants to do OCR themselves, one can initialize the image processor with apply_ocr set to +False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_72.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..36488c0df85840c52edcff48ce7093f1b36ed660 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_72.txt @@ -0,0 +1,2 @@ +In that case, one should provide the words and corresponding (normalized) bounding boxes themselves to +the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_73.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..da4cd8767c20149e4341a1231eb8393c063d843f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_73.txt @@ -0,0 +1,6 @@ +thon +from transformers import LayoutLMv2Processor +from PIL import Image +processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr") +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_74.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_74.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_75.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..47380e56b9f80e9b130002a7a32fd98097f9071a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_75.txt @@ -0,0 +1,10 @@ +).convert("RGB") +words = ["hello", "world"] +boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes +encoding = processor(image, words, boxes=boxes, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) + +Use case 3: token classification (training), apply_ocr=False +For token classification tasks (such as FUNSD, CORD, SROIE, Kleister-NDA), one can also provide the corresponding word +labels in order to train a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_76.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..b325201d099f869205e4a84d85dfd575677755ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_76.txt @@ -0,0 +1 @@ +The processor will then convert these into token-level labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_77.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..2df68ab2302942f1e6f62c6b6dd842b8444b466f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_77.txt @@ -0,0 +1,3 @@ +By default, it +will only label the first wordpiece of a word, and label the remaining wordpieces with -100, which is the +ignore_index of PyTorch's CrossEntropyLoss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_78.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..0315308ef6a2b8805b52b9a972cc2987b93b845e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_78.txt @@ -0,0 +1,2 @@ +In case you want all wordpieces of a word to be labeled, you can +initialize the tokenizer with only_label_first_subword set to False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_79.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..da4cd8767c20149e4341a1231eb8393c063d843f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_79.txt @@ -0,0 +1,6 @@ +thon +from transformers import LayoutLMv2Processor +from PIL import Image +processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr") +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_80.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_80.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_81.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e8db6d0832f04bef935498620094acfae410dcd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_81.txt @@ -0,0 +1,10 @@ +).convert("RGB") +words = ["hello", "world"] +boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes +word_labels = [1, 2] +encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image']) + +Use case 4: visual question answering (inference), apply_ocr=True +For visual question answering tasks (such as DocVQA), you can provide a question to the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_82.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fd4fa2be11d22fcb74cebf3f67206f82699030d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_82.txt @@ -0,0 +1,2 @@ +By default, the +processor will apply OCR on the image, and create [CLS] question tokens [SEP] word tokens [SEP]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_83.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0b43ccfcb31412e4a6cfd8280fac6af6c20fb2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_83.txt @@ -0,0 +1,6 @@ +thon +from transformers import LayoutLMv2Processor +from PIL import Image +processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased") +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_84.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_84.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_85.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..e37c36fc4c132b191dc6994b9b2092153b132994 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_85.txt @@ -0,0 +1,2 @@ +).convert("RGB") +question = "What's his name?" 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_86.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..6df907582e888993598b1c3d19968f7cadafdb15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_86.txt @@ -0,0 +1,6 @@ +encoding = processor(image, question, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) + +Use case 5: visual question answering (inference), apply_ocr=False +For visual question answering tasks (such as DocVQA), you can provide a question to the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_87.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd328335901cb8f566d76b1f1eb88b0abc6a560a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_87.txt @@ -0,0 +1,2 @@ +If you want to +perform OCR yourself, you can provide your own words and (normalized) bounding boxes to the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_88.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..da4cd8767c20149e4341a1231eb8393c063d843f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_88.txt @@ -0,0 +1,6 @@ +thon +from transformers import LayoutLMv2Processor +from PIL import Image +processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr") +image = Image.open( + "name_of_your_document - can be a png, jpg, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_89.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf147327975c499f5e080521221decdcd523135d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_89.txt @@ -0,0 +1 @@ +of your documents (PDFs must be converted to images)." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_90.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..e37c36fc4c132b191dc6994b9b2092153b132994 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_90.txt @@ -0,0 +1,2 @@ +).convert("RGB") +question = "What's his name?" 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_91.txt b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd6051fec36211ff45e18b6341fe949d1e839be1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv2/chunk_91.txt @@ -0,0 +1,33 @@ +words = ["hello", "world"] +boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes +encoding = processor(image, question, words, boxes=boxes, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image']) + +LayoutLMv2Config +[[autodoc]] LayoutLMv2Config +LayoutLMv2FeatureExtractor +[[autodoc]] LayoutLMv2FeatureExtractor + - call +LayoutLMv2ImageProcessor +[[autodoc]] LayoutLMv2ImageProcessor + - preprocess +LayoutLMv2Tokenizer +[[autodoc]] LayoutLMv2Tokenizer + - call + - save_vocabulary +LayoutLMv2TokenizerFast +[[autodoc]] LayoutLMv2TokenizerFast + - call +LayoutLMv2Processor +[[autodoc]] LayoutLMv2Processor + - call +LayoutLMv2Model +[[autodoc]] LayoutLMv2Model + - forward +LayoutLMv2ForSequenceClassification +[[autodoc]] LayoutLMv2ForSequenceClassification +LayoutLMv2ForTokenClassification +[[autodoc]] LayoutLMv2ForTokenClassification +LayoutLMv2ForQuestionAnswering +[[autodoc]] LayoutLMv2ForQuestionAnswering \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_10.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_10.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_11.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_12.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..44ac62d2c4d4b171f48715c20b5b53ec74c6154b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_12.txt @@ -0,0 +1 @@ +The TensorFlow version of this model was added by chriskoo, tokec, and lre. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_13.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_14.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a933d623125724bf28455291d00a1d26842ec62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_14.txt @@ -0,0 +1,4 @@ +Usage tips + +In terms of data processing, LayoutLMv3 is identical to its predecessor LayoutLMv2, except that: +images need to be resized and normalized with channels in regular RGB format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_15.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2f3eb3f29bfc70e5009dd5728592800f92deb8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_15.txt @@ -0,0 +1 @@ +LayoutLMv2 on the other hand normalizes the images internally and expects the channels in BGR format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_16.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6309631b0357c539b4b2ecdb8d34551e7b756fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_16.txt @@ -0,0 +1 @@ +text is tokenized using byte-pair encoding (BPE), as opposed to WordPiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_17.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb2111dcc3fb2123d7616e81e1738a2d5c5cdf11 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_17.txt @@ -0,0 +1 @@ +Due to these differences in data preprocessing, one can use [LayoutLMv3Processor] which internally combines a [LayoutLMv3ImageProcessor] (for the image modality) and a [LayoutLMv3Tokenizer]/[LayoutLMv3TokenizerFast] (for the text modality) to prepare all data for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_18.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7221fe4b9900b8d9ab65524fb2f8d84e399c8f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_18.txt @@ -0,0 +1 @@ +Regarding usage of [LayoutLMv3Processor], we refer to the usage guide of its predecessor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_19.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..19399bc7b46b11cfa556d26acf0ef6c4f5a581c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_19.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LayoutLMv3. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_20.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_20.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_21.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_22.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d455d2e6c476a5be9416aceb42896b3efbdc93d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_22.txt @@ -0,0 +1 @@ +LayoutLMv3 is nearly identical to LayoutLMv2, so we've also included LayoutLMv2 resources you can adapt for LayoutLMv3 tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_23.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..479a443ad3e427bd128997ae88957d89c8a1a6fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_23.txt @@ -0,0 +1 @@ +For these notebooks, take care to use [LayoutLMv2Processor] instead when preparing data for the model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_24.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..51e47eb6a5b426d0302181d9da393765db1124c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_24.txt @@ -0,0 +1 @@ +Demo notebooks for LayoutLMv3 can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_25.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eb650f4863e77465a30cdad518f94f7d77723fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_25.txt @@ -0,0 +1 @@ +Demo scripts can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_26.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..37b65f06adf14a51a325a3294227587fef3864c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_26.txt @@ -0,0 +1 @@ +[LayoutLMv2ForSequenceClassification] is supported by this notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_27.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9ee46b962ee732a95fc25a2a460b5de9da7b912 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_27.txt @@ -0,0 +1,3 @@ +Text classification task guide + +[LayoutLMv3ForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_28.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..94efd94e61fde515a34638838d3b73c54fff7fcd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_28.txt @@ -0,0 +1 @@ +A notebook for how to perform inference with [LayoutLMv2ForTokenClassification] and a notebook for how to perform inference when no labels are available with [LayoutLMv2ForTokenClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_29.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..025c8a579cab886e2426fb9592289a01b27f95fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_29.txt @@ -0,0 +1 @@ +A notebook for how to finetune [LayoutLMv2ForTokenClassification] with the 🤗 Trainer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_30.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7c947e85010e1b9ada4b1dfeb17fe8deee3f007 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_30.txt @@ -0,0 +1,3 @@ +Token classification task guide + +[LayoutLMv2ForQuestionAnswering] is supported by this notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_31.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..826fb2c7e3bd53ae62690c8956149d4e0117f765 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_31.txt @@ -0,0 +1,48 @@ +Question answering task guide + +Document question answering +- Document question answering task guide +LayoutLMv3Config +[[autodoc]] LayoutLMv3Config +LayoutLMv3FeatureExtractor +[[autodoc]] LayoutLMv3FeatureExtractor + - call +LayoutLMv3ImageProcessor +[[autodoc]] LayoutLMv3ImageProcessor + - preprocess +LayoutLMv3Tokenizer +[[autodoc]] LayoutLMv3Tokenizer + - call + - save_vocabulary +LayoutLMv3TokenizerFast +[[autodoc]] LayoutLMv3TokenizerFast + - call +LayoutLMv3Processor +[[autodoc]] LayoutLMv3Processor + - call + +LayoutLMv3Model +[[autodoc]] LayoutLMv3Model + - forward +LayoutLMv3ForSequenceClassification +[[autodoc]] LayoutLMv3ForSequenceClassification + - forward +LayoutLMv3ForTokenClassification +[[autodoc]] LayoutLMv3ForTokenClassification + - forward +LayoutLMv3ForQuestionAnswering +[[autodoc]] LayoutLMv3ForQuestionAnswering + - forward + +TFLayoutLMv3Model +[[autodoc]] TFLayoutLMv3Model + - call +TFLayoutLMv3ForSequenceClassification +[[autodoc]] TFLayoutLMv3ForSequenceClassification + - call +TFLayoutLMv3ForTokenClassification +[[autodoc]] TFLayoutLMv3ForTokenClassification + - call +TFLayoutLMv3ForQuestionAnswering +[[autodoc]] TFLayoutLMv3ForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_9.txt b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..114946730c8297b0a5654c959681c5c34b9f7102 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutlmv3/chunk_9.txt @@ -0,0 +1 @@ +LayoutLMv3 architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..690b27709010802e9124fc5b8ce584b06c09d01d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_10.txt @@ -0,0 +1 @@ +As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to LayoutLMv2's documentation page for all tips, code examples and notebooks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_11.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5771eb5865f451b74c9926ff77be9926c326c3e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_11.txt @@ -0,0 +1,13 @@ +LayoutXLMTokenizer +[[autodoc]] LayoutXLMTokenizer + - call + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +LayoutXLMTokenizerFast +[[autodoc]] LayoutXLMTokenizerFast + - call +LayoutXLMProcessor +[[autodoc]] LayoutXLMProcessor + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_7.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..94e0e73e67157cca5ad0bef2085f7aead6d6810f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_8.txt @@ -0,0 +1,8 @@ +Usage tips and examples +One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like so: +thon +from transformers import LayoutLMv2Model +model = LayoutLMv2Model.from_pretrained("microsoft/layoutxlm-base") + +Note that LayoutXLM has its own tokenizer, based on +[LayoutXLMTokenizer]/[LayoutXLMTokenizerFast]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d47f6cfa032afb3801e6386093c0e743cbfb780 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_layoutxlm/chunk_9.txt @@ -0,0 +1,10 @@ +You can initialize it as +follows: +thon +from transformers import LayoutXLMTokenizer +tokenizer = LayoutXLMTokenizer.from_pretrained("microsoft/layoutxlm-base") + +Similar to LayoutLMv2, you can use [LayoutXLMProcessor] (which internally applies +[LayoutLMv2ImageProcessor] and +[LayoutXLMTokenizer]/[LayoutXLMTokenizerFast] in sequence) to prepare all +data for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_10.txt b/chunked/content_aware_chunking/model_doc_led/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ff3b09aec0adfffebcbd8f76b2cba6c67aa5ccd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_10.txt @@ -0,0 +1,2 @@ +LED works very well on long-range sequence-to-sequence tasks where the input_ids largely exceed a length of + 1024 tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_11.txt b/chunked/content_aware_chunking/model_doc_led/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f22e7656cba6c7d4c24997bab931d24c6bb68e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_11.txt @@ -0,0 +1 @@ +LED pads the input_ids to be a multiple of config.attention_window if required. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_12.txt b/chunked/content_aware_chunking/model_doc_led/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..563d4a5470be853a557e9c21875936205542ae83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_12.txt @@ -0,0 +1,2 @@ +Therefore a small speed-up is + gained, when [LEDTokenizer] is used with the pad_to_multiple_of argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_13.txt b/chunked/content_aware_chunking/model_doc_led/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..e334e3da2e85a51c12e3289101dd8b183951dbea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_13.txt @@ -0,0 +1,2 @@ +LED makes use of global attention by means of the global_attention_mask (see + [LongformerModel]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_14.txt b/chunked/content_aware_chunking/model_doc_led/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..53019d6e36495d4c8bdcb056f80137ad777ab93c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_14.txt @@ -0,0 +1,2 @@ +For summarization, it is advised to put global attention only on the first + token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_15.txt b/chunked/content_aware_chunking/model_doc_led/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..719ba4ff9acb96f4ed202eed56d6538ff8e5dc65 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_15.txt @@ -0,0 +1 @@ +For question answering, it is advised to put global attention on all tokens of the question. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_16.txt b/chunked/content_aware_chunking/model_doc_led/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..036bd00a2e699c00e3c71e809014007fb01416fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_16.txt @@ -0,0 +1,2 @@ +To fine-tune LED on all 16384, gradient checkpointing can be enabled in case training leads to out-of-memory (OOM) + errors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_17.txt b/chunked/content_aware_chunking/model_doc_led/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6073f958dfcfc8dc0de30decc87c806aa0bcd5c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_17.txt @@ -0,0 +1 @@ +This can be done by executing model.gradient_checkpointing_enable(). 
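To make the global-attention advice above concrete, here is a minimal summarization sketch. The allenai/led-base-16384 checkpoint is used only as an example and is not prescribed by the tips above.

```python
import torch
from transformers import LEDTokenizer, LEDForConditionalGeneration

tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

long_article = "..."  # a long input document
inputs = tokenizer(long_article, return_tensors="pt", truncation=True, max_length=16384)

# put global attention on the first token only, as advised for summarization
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1

summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    global_attention_mask=global_attention_mask,
    num_beams=2,
    max_length=256,
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))
```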
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_18.txt b/chunked/content_aware_chunking/model_doc_led/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..691e060777270950e009ed2a9278e1e05543f92d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_18.txt @@ -0,0 +1,2 @@ +Moreover, the use_cache=False + flag can be used to disable the caching mechanism to save memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_19.txt b/chunked/content_aware_chunking/model_doc_led/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..374806371c724c2fd9500a497adcf3f6321b457b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_19.txt @@ -0,0 +1,2 @@ +LED is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_20.txt b/chunked/content_aware_chunking/model_doc_led/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_20.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_21.txt b/chunked/content_aware_chunking/model_doc_led/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d02b482778c42671f483fa1464490721fa6d646 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_21.txt @@ -0,0 +1,3 @@ +Resources + +A notebook showing how to evaluate LED. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_22.txt b/chunked/content_aware_chunking/model_doc_led/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..63a9d35a4954b69cefabd1ad040a4c968acb8a78 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_22.txt @@ -0,0 +1 @@ +A notebook showing how to fine-tune LED. 
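Putting the LED tips above together, a minimal summarization sketch (illustrative only; the allenai/led-base-16384 checkpoint and the placeholder article are assumptions):

import torch
from transformers import LEDForConditionalGeneration, LEDTokenizer

tokenizer = LEDTokenizer.from_pretrained("allenai/led-base-16384")
model = LEDForConditionalGeneration.from_pretrained("allenai/led-base-16384")

# when fine-tuning on the full 16384-token window, memory can be saved with:
#   model.gradient_checkpointing_enable()
#   model.config.use_cache = False

long_article = "..."  # placeholder for a document far longer than 1024 tokens
inputs = tokenizer(
    long_article,
    return_tensors="pt",
    truncation=True,
    max_length=16384,
    padding=True,
    pad_to_multiple_of=1024,  # this checkpoint's attention window, for the small speed-up noted above
)

# for summarization, put global attention only on the first token
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1

summary_ids = model.generate(
    inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
    global_attention_mask=global_attention_mask,
    max_length=256,
)
print(tokenizer.decode(summary_ids[0], skip_special_tokens=True))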
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_23.txt b/chunked/content_aware_chunking/model_doc_led/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a96c61a929af9e94cecbb4c949faeb84d56363c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_23.txt @@ -0,0 +1,44 @@ +Text classification task guide +Question answering task guide +Translation task guide +Summarization task guide + +LEDConfig +[[autodoc]] LEDConfig +LEDTokenizer +[[autodoc]] LEDTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +LEDTokenizerFast +[[autodoc]] LEDTokenizerFast +LED specific outputs +[[autodoc]] models.led.modeling_led.LEDEncoderBaseModelOutput +[[autodoc]] models.led.modeling_led.LEDSeq2SeqModelOutput +[[autodoc]] models.led.modeling_led.LEDSeq2SeqLMOutput +[[autodoc]] models.led.modeling_led.LEDSeq2SeqSequenceClassifierOutput +[[autodoc]] models.led.modeling_led.LEDSeq2SeqQuestionAnsweringModelOutput +[[autodoc]] models.led.modeling_tf_led.TFLEDEncoderBaseModelOutput +[[autodoc]] models.led.modeling_tf_led.TFLEDSeq2SeqModelOutput +[[autodoc]] models.led.modeling_tf_led.TFLEDSeq2SeqLMOutput + +LEDModel +[[autodoc]] LEDModel + - forward +LEDForConditionalGeneration +[[autodoc]] LEDForConditionalGeneration + - forward +LEDForSequenceClassification +[[autodoc]] LEDForSequenceClassification + - forward +LEDForQuestionAnswering +[[autodoc]] LEDForQuestionAnswering + - forward + +TFLEDModel +[[autodoc]] TFLEDModel + - call +TFLEDForConditionalGeneration +[[autodoc]] TFLEDForConditionalGeneration + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_7.txt b/chunked/content_aware_chunking/model_doc_led/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..32a94c860bc4161f169876f1c183722a81ded735 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_7.txt @@ -0,0 +1,3 @@ +We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting +long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization +dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_8.txt b/chunked/content_aware_chunking/model_doc_led/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f122d275e5095ad042d0688065ca339c7b729f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_8.txt @@ -0,0 +1,5 @@ +Usage tips + +[LEDForConditionalGeneration] is an extension of + [BartForConditionalGeneration] exchanging the traditional self-attention layer with + Longformer's chunked self-attention layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_led/chunk_9.txt b/chunked/content_aware_chunking/model_doc_led/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cc1be2ae5129a193954d267b8bd9b9b21388e10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_led/chunk_9.txt @@ -0,0 +1,2 @@ +[LEDTokenizer] is an alias of + [BartTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..483c3080d7c590a2bc3c60a92d46b7dabeb50599 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_11.txt @@ -0,0 +1,3 @@ +* + + LeViT Architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..48ec3b90e5f98587018baf8373027433c9d221d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by anugunj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..da25d9bd4d738f2116bbfc7561d96c4f39410ce0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +Compared to ViT, LeViT models use an additional distillation head to effectively learn from a teacher (which, in the LeViT paper, is a ResNet like-model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..776c5ce5d07dc6d9cfca19d9d7ccbece042b2025 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_16.txt @@ -0,0 +1 @@ +The distillation head is learned through backpropagation under supervision of a ResNet like-model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c6b49b4b30470b72c198536252d4c2f010cf197 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_17.txt @@ -0,0 +1 @@ +They also draw inspiration from convolution neural networks to use activation maps with decreasing resolutions to increase the efficiency. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..5658b8b0b84c8955a4b672961f4d590166256727 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_18.txt @@ -0,0 +1,3 @@ +There are 2 ways to fine-tune distilled models, either (1) in a classic way, by only placing a prediction head on top + of the final hidden state and not using the distillation head, or (2) by placing both a prediction head and distillation + head on top of the final hidden state. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..c57a42722ba4a7360ea1a4bfddefb3f3e5e84497 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_19.txt @@ -0,0 +1,3 @@ +In that case, the prediction head is trained using regular cross-entropy between + the prediction of the head and the ground-truth label, while the distillation prediction head is trained using hard distillation + (cross-entropy between the prediction of the distillation head and the label predicted by the teacher). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2cbec8289b057b9b73a97c36ecae25d89ba2311 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_20.txt @@ -0,0 +1,2 @@ +At inference time, + one takes the average prediction between both heads as final prediction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d512dbd95a9cdc636db91b7f031940516439811e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_21.txt @@ -0,0 +1,2 @@ +(2) is also called "fine-tuning with distillation", + because one relies on a teacher that has already been fine-tuned on the downstream dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e2747a5e5e980f71f77d64ccb0139ef1e0e45b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_22.txt @@ -0,0 +1,2 @@ +In terms of models, (1) corresponds + to [LevitForImageClassification] and (2) corresponds to [LevitForImageClassificationWithTeacher]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1a5e4aa55eebbc464e97140e408bfd125326a24 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_23.txt @@ -0,0 +1,2 @@ +All released checkpoints were pre-trained and fine-tuned on ImageNet-1k + (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7fece2b3f220124906b608405b74af83c31edbb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_24.txt @@ -0,0 +1 @@ +only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_25.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfc40976a4f63372d853e16085cb220b2f4d1934 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_25.txt @@ -0,0 +1 @@ +No external data was used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_26.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..45a644e2560959b3b63c5f91326d28a38584ee13 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_26.txt @@ -0,0 +1,3 @@ +This is in + contrast with the original ViT model, which used external data like the JFT-300M dataset/Imagenet-21k for + pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_27.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..38547407df9a08efa6c9c387c586df6f752ba8f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_27.txt @@ -0,0 +1 @@ +The authors of LeViT released 5 trained LeViT models, which you can directly plug into [LevitModel] or [LevitForImageClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_28.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d570c5e690efec56b430ad5725c332481beac0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_28.txt @@ -0,0 +1,2 @@ +Techniques like data augmentation, optimization, and regularization were used in order to simulate training on a much larger dataset + (while only using ImageNet-1k for pre-training). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_29.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..f975ec614be0f487e4da683d65bd72065322e06b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_29.txt @@ -0,0 +1,3 @@ +The 5 variants available are (all trained on images of size 224x224): + facebook/levit-128S, facebook/levit-128, facebook/levit-192, facebook/levit-256 and + facebook/levit-384. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_30.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b51d1ac26c6973dc961a059c683ada3f61eb1326 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_30.txt @@ -0,0 +1,2 @@ +Note that one should use [LevitImageProcessor] in order to + prepare images for the model. 
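To illustrate that note, a short inference sketch with [LevitImageProcessor] and the teacher variant (the image path is a placeholder, not taken from the original text):

from PIL import Image
from transformers import LevitImageProcessor, LevitForImageClassificationWithTeacher

processor = LevitImageProcessor.from_pretrained("facebook/levit-128S")
model = LevitForImageClassificationWithTeacher.from_pretrained("facebook/levit-128S")

image = Image.open("cat.jpg").convert("RGB")   # placeholder image
inputs = processor(images=image, return_tensors="pt")

logits = model(**inputs).logits                # average of the classification and distillation heads
print(model.config.id2label[logits.argmax(-1).item()])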
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_31.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4578ade575df025ba8a920b30e1345a9aee397f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_31.txt @@ -0,0 +1 @@ +[LevitForImageClassificationWithTeacher] currently supports only inference and not training or fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_32.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b70df3dfb7d83a4099919dfecd33f13a9756fa2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_32.txt @@ -0,0 +1,2 @@ +You can check out demo notebooks regarding inference as well as fine-tuning on custom data here + (you can just replace [ViTFeatureExtractor] by [LevitImageProcessor] and [ViTForImageClassification] by [LevitForImageClassification] or [LevitForImageClassificationWithTeacher]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_33.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc8a1aaa48c1cd1e9bf484eb562090e9e54e6832 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_33.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LeViT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_34.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcf244b9980d244d5d3452064e280100ef491265 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_34.txt @@ -0,0 +1 @@ +[LevitForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_35.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_35.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_36.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_36.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_levit/chunk_37.txt b/chunked/content_aware_chunking/model_doc_levit/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2d4bb158c4012cf7ec4bdc504169c74eba1775f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_levit/chunk_37.txt @@ -0,0 +1,17 @@ +LevitConfig +[[autodoc]] LevitConfig +LevitFeatureExtractor +[[autodoc]] LevitFeatureExtractor + - call +LevitImageProcessor +[[autodoc]] LevitImageProcessor + - preprocess +LevitModel +[[autodoc]] LevitModel + - forward +LevitForImageClassification +[[autodoc]] LevitForImageClassification + - forward +LevitForImageClassificationWithTeacher +[[autodoc]] LevitForImageClassificationWithTeacher + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a9067a2eb1888db3bf7a536a087d806aa674f09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +To combine the Language-Independent Layout Transformer with a new RoBERTa checkpoint from the hub, refer to this guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4631a6890cc0aa69082b8be1e93b3a0614eacc46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_12.txt @@ -0,0 +1 @@ +The script will result in config.json and pytorch_model.bin files being stored locally. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0912c5894d6761d6b4d381d07ab831dbc3c749e5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_13.txt @@ -0,0 +1,8 @@ +After doing this, one can do the following (assuming you're logged in with your HuggingFace account): + +thon +from transformers import LiltModel +model = LiltModel.from_pretrained("path_to_your_files") +model.push_to_hub("name_of_repo_on_the_hub") + +When preparing data for the model, make sure to use the token vocabulary that corresponds to the RoBERTa checkpoint you combined with the Layout Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc2b16e354c70e22b2c4d197dd92baa60d519bc9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_14.txt @@ -0,0 +1 @@ +As lilt-roberta-en-base uses the same vocabulary as LayoutLMv3, one can use [LayoutLMv3TokenizerFast] to prepare data for the model. 
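A minimal sketch of that preparation step (the toy words/boxes and the SCUT-DLVCLab/lilt-roberta-en-base hub id are assumptions made for illustration):

from transformers import LayoutLMv3TokenizerFast, LiltModel

# lilt-roberta-en-base shares its vocabulary with LayoutLMv3, so a LayoutLMv3 tokenizer can prepare its inputs
tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
model = LiltModel.from_pretrained("SCUT-DLVCLab/lilt-roberta-en-base")

words = ["Invoice", "Total:", "42.00"]                                   # toy OCR output
boxes = [[48, 60, 140, 80], [48, 100, 120, 120], [130, 100, 200, 120]]   # bounding boxes normalized to 0-1000
encoding = tokenizer(words, boxes=boxes, return_tensors="pt")
outputs = model(**encoding)
print(outputs.last_hidden_state.shape)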
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..68f4eca44ff95b99ac796143a133fdb82d9f211a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_15.txt @@ -0,0 +1 @@ +The same is true for lilt-roberta-en-base: one can use [LayoutXLMTokenizerFast] for that model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc664b90a616187b2b776b431dff1f20fc7655ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_16.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LiLT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b54c11a4a1b1a29bfc129b02e66ad43040c0a5e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_17.txt @@ -0,0 +1 @@ +Demo notebooks for LiLT can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..27d997b9f26349f9bfb29fc5af0cec53c130dd1c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_18.txt @@ -0,0 +1,5 @@ +Documentation resources +- Text classification task guide +- Token classification task guide +- Question answering task guide +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05f8856b750ccce8b3556afcf5fc0b901f786c96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_20.txt @@ -0,0 +1,14 @@ +LiltConfig +[[autodoc]] LiltConfig +LiltModel +[[autodoc]] LiltModel + - forward +LiltForSequenceClassification +[[autodoc]] LiltForSequenceClassification + - forward +LiltForTokenClassification +[[autodoc]] LiltForTokenClassification + - forward +LiltForQuestionAnswering +[[autodoc]] LiltForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f26f57aceaba0f6ce04cc23cc620efd809f2831 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_6.txt @@ -0,0 +1 @@ +Experimental results on eight languages have shown that LiLT can achieve competitive or even superior performance on diverse widely-used downstream benchmarks, which enables language-independent benefit from the pre-training of document layout structure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..225a53e9c04ff04315d2700f574db1b52d6e9945 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_7.txt @@ -0,0 +1 @@ +LiLT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lilt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_lilt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lilt/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_11.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..08ed4da9c18a7f7b1e4e5457f6cfdeaf9b9dc7f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_11.txt @@ -0,0 +1 @@ +For the 65B model, it's thus 130GB of RAM needed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_12.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecebfbb96862a6f3c5f508f4de2fe53d06f7d6e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_12.txt @@ -0,0 +1 @@ +The LLaMA tokenizer is a BPE model based on sentencepiece. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_13.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..374006591f8097ecca75b46bb3007c5fd9fd8426 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_13.txt @@ -0,0 +1 @@ +One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_14.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..62fb268d0481eb7d8c2c42849d091bd15c50e4bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_14.txt @@ -0,0 +1 @@ +"Banana"), the tokenizer does not prepend the prefix space to the string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_15.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cb28d851fcfbaf5a7ca7abe354814cc10bce611 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by zphang with contributions from BlackSamorez. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_16.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..45b2d6b3dcad18527ef853c7992f757d3b799a26 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_16.txt @@ -0,0 +1 @@ +The code of the implementation in Hugging Face is based on GPT-NeoX here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_17.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0106595019f9ad5e4a1cdf2d88f2cb0307876cfc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_17.txt @@ -0,0 +1 @@ +The original code of the authors can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_18.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..241fe1242331607277b323d9c69dd40873635b87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_18.txt @@ -0,0 +1 @@ +The Flax version of the implementation was contributed by afmck with the code in the implementation based on Hugging Face's Flax GPT-Neo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_19.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebf205a04ac9c259ef704f1c55aedd24e24ab901 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_19.txt @@ -0,0 +1,3 @@ +Based on the original LLaMA model, Meta AI has released some follow-up works: + +Llama2: Llama2 is an improved version of Llama with some architectural tweaks (Grouped Query Attention), and is pre-trained on 2Trillion tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_20.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0986644db24bbb36c763a87fb304fb32a9c5fa3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_20.txt @@ -0,0 +1 @@ +Refer to the documentation of Llama2 which can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_21.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..eacda1a74d495c33ed0f8e5dc7c404b374c69074 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_21.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LLaMA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_22.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_22.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_23.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_23.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_24.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..fce2b8d4487a75bf2cc24d1120a68dcc2dc92026 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_24.txt @@ -0,0 +1 @@ +A notebook on how to use prompt tuning to adapt the LLaMA model for text classification task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_25.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..884bfe155044ab23001323d674c78b72d00e188c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_25.txt @@ -0,0 +1,3 @@ +🌎 + +StackLLaMA: A hands-on guide to train LLaMA with RLHF, a blog post about how to train LLaMA to answer questions on Stack Exchange with RLHF. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_26.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cb7e030326c7ea98e28a135b1320f710f5a0f37 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_26.txt @@ -0,0 +1,2 @@ +âš—ï¸ Optimization +- A notebook on how to fine-tune LLaMA model using xturing library on GPU which has limited memory. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_27.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b58c009fb2dbf6364b6bf9c2504e4df67b7b7cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_27.txt @@ -0,0 +1,3 @@ +🌎 +âš¡ï¸ Inference +- A notebook on how to run the LLaMA Model using PeftModel from the 🤗 PEFT library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_28.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..25c31517a8985ef50e496351a102817c24e1ed9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_28.txt @@ -0,0 +1,2 @@ +🌎 +- A notebook on how to load a PEFT adapter LLaMA model with LangChain. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_29.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac3e34dd5eed753859b59e24b80a80c3de5aa18b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_29.txt @@ -0,0 +1,3 @@ +🌎 +🚀 Deploy +- A notebook on how to fine-tune LLaMA model using LoRA method via the 🤗 PEFT library with intuitive UI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_30.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ec8c41b7b87eaa852e34e54121eaffa0f00f9fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_30.txt @@ -0,0 +1,2 @@ +🌎 +- A notebook on how to deploy Open-LLaMA model for text generation on Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama/chunk_31.txt b/chunked/content_aware_chunking/model_doc_llama/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e9f43265fe93527aebc92ba8f41fb878d6ab445 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama/chunk_31.txt @@ -0,0 +1,34 @@ +🌎 +LlamaConfig +[[autodoc]] LlamaConfig +LlamaTokenizer +[[autodoc]] LlamaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +LlamaTokenizerFast +[[autodoc]] LlamaTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - update_post_processor + - save_vocabulary +LlamaModel +[[autodoc]] LlamaModel + - forward +LlamaForCausalLM +[[autodoc]] LlamaForCausalLM + - forward +LlamaForSequenceClassification +[[autodoc]] LlamaForSequenceClassification + - forward +LlamaForQuestionAnswering +[[autodoc]] LlamaForQuestionAnswering + - forward +FlaxLlamaModel +[[autodoc]] FlaxLlamaModel + - call +FlaxLlamaForCausalLM +[[autodoc]] FlaxLlamaForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b2f03fa72bb331a5c2aa2b24cd33b54948d4193 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_18.txt @@ -0,0 +1 @@ +You should also set the model.config.pad_token_id. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..842af9edd2f7e140095d42bf8df3a7885310c0ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_19.txt @@ -0,0 +1 @@ +The embed_tokens layer of the model is initialized with self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.config.padding_idx), which makes sure that encoding the padding token will output zeros, so passing it when initializing is recommended. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..49eb0c69f6e1d427c684436e00846548aecccde1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_20.txt @@ -0,0 +1 @@ +After filling out the form and gaining access to the model checkpoints, you should be able to use the already converted checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..2473a202a5450f7908b214e24e7cb0afd0682b17 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_21.txt @@ -0,0 +1 @@ +Otherwise, if you are converting your own model, feel free to use the conversion script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..878ea58fb938cb033bd39a261574f459ab7120ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_22.txt @@ -0,0 +1,14 @@ +The script can be called with the following (example) command: + +python src/transformers/models/llama/convert_llama_weights_to_hf.py \ + --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /output/path + +After conversion, the model and tokenizer can be loaded via: + +thon +from transformers import LlamaForCausalLM, LlamaTokenizer +tokenizer = LlamaTokenizer.from_pretrained("/output/path") +model = LlamaForCausalLM.from_pretrained("/output/path") + +Note that executing the script requires enough CPU RAM to host the whole model in float16 precision (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4157e83fe4650fb589aa9ed477bfe6c7e60a6414 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_23.txt @@ -0,0 +1 @@ +For the 75B model, it's thus 145GB of RAM needed. 
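Once converted, the checkpoint can be used for generation; the sketch below is illustrative only (device_map="auto" assumes the accelerate package and sufficient GPU/CPU memory):

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

tokenizer = LlamaTokenizer.from_pretrained("/output/path")
model = LlamaForCausalLM.from_pretrained("/output/path", torch_dtype=torch.float16, device_map="auto")

prompt = "The three primary colors are"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=30)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))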
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecebfbb96862a6f3c5f508f4de2fe53d06f7d6e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_24.txt @@ -0,0 +1 @@ +The LLaMA tokenizer is a BPE model based on sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_25.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..374006591f8097ecca75b46bb3007c5fd9fd8426 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_25.txt @@ -0,0 +1 @@ +One quirk of sentencepiece is that when decoding a sequence, if the first token is the start of the word (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_26.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..62fb268d0481eb7d8c2c42849d091bd15c50e4bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_26.txt @@ -0,0 +1 @@ +"Banana"), the tokenizer does not prepend the prefix space to the string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_27.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72d863fe0a3f7ec359140f47a11acc88e66729b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_27.txt @@ -0,0 +1 @@ +When using Flash Attention 2 via attn_implementation="flash_attention_2", don't pass torch_dtype to the from_pretrained class method and use Automatic Mixed-Precision training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_28.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..736bc45e781b530f4ad89d258f6009f52eac3548 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_28.txt @@ -0,0 +1 @@ +When using Trainer, it is simply specifying either fp16 or bf16 to True. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_29.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..028c0c5673fb3ba9ded9f455f16749c5eb67e3bc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_29.txt @@ -0,0 +1 @@ +Otherwise, make sure you are using torch.autocast. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_30.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..46c40acb2555823bf573063b218e8c91f6970a8f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_30.txt @@ -0,0 +1 @@ +This is required because the Flash Attention only support fp16 and bf16 data type. 
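A hedged sketch of the Flash Attention 2 pattern described above (the meta-llama/Llama-2-7b-hf checkpoint, an installed flash-attn package, a supported GPU with enough memory, and bf16 autocast are all assumptions):

import torch
from transformers import LlamaForCausalLM, LlamaTokenizer

model_id = "meta-llama/Llama-2-7b-hf"                     # gated checkpoint, access assumed
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# no torch_dtype is passed here, as advised above; mixed precision is handled by autocast instead
model = LlamaForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2").to("cuda")

inputs = tokenizer("Flash attention lets us", return_tensors="pt").to("cuda")
with torch.autocast(device_type="cuda", dtype=torch.bfloat16):  # Flash Attention only supports fp16/bf16
    outputs = model(**inputs)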
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_31.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb68e860e7930946d82130f68e150bb7bb354939 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_31.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with LLaMA2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_32.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_32.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_33.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_33.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_34.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..40cc5488ac796cdcb7ea016d6174f2a23c828624 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_34.txt @@ -0,0 +1 @@ +Llama 2 is here - get it on Hugging Face, a blog post about Llama 2 and how to use it with 🤗 Transformers and 🤗 PEFT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_35.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ff45fa8be0b2801c79b5172beff104b25b6b799 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_35.txt @@ -0,0 +1 @@ +LLaMA 2 - Every Resource you need, a compilation of relevant resources to learn about LLaMA 2 and how to get started quickly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_36.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aa2168ea2f50fdd1763dfa661b920f396206196 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_36.txt @@ -0,0 +1 @@ +A notebook on how to fine-tune Llama 2 in Google Colab using QLoRA and 4-bit precision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_37.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d982f9b78586b300c34ec865f677c9b46feda53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_37.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to fine-tune the "Llama-v2-7b-guanaco" model with 4-bit QLoRA and generate Q&A datasets from PDFs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_38.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..91410317d278fb3a46a506ae9311ee18c1492d79 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_38.txt @@ -0,0 +1,3 @@ +🌎 + +A notebook on how to fine-tune the Llama 2 model with QLoRa, TRL, and Korean text classification dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_39.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..6358a6b2e6c84dbdc47eb636209997b7eed4d3ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_39.txt @@ -0,0 +1,4 @@ +🌎🇰🇷 + +âš—ï¸ Optimization +- Fine-tune Llama 2 with DPO, a guide to using the TRL library's DPO method to fine tune Llama 2 on a specific dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_40.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e6052c39daed44d4a67698da34b11f3c543833e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_40.txt @@ -0,0 +1 @@ +- Extended Guide: Instruction-tune Llama 2, a guide to training Llama 2 to generate instructions from inputs, transforming the model from instruction-following to instruction-giving. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_41.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2d9ca51bd4f1bec2b000138e885e5c78aa529ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_41.txt @@ -0,0 +1 @@ +- A notebook on how to fine-tune the Llama 2 model on a personal computer using QLoRa and TRL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_42.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..0743aad90d0697d0cd5ad9210ec7045873529f0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_42.txt @@ -0,0 +1,3 @@ +🌎 +âš¡ï¸ Inference +- A notebook on how to quantize the Llama 2 model using GPTQ from the AutoGPTQ library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_43.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..26ae2d669697b6d3bff7f46b4c0dd5882e1d3318 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_43.txt @@ -0,0 +1,2 @@ +🌎 +- A notebook on how to run the Llama 2 Chat Model with 4-bit quantization on a local computer or Google Colab. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_44.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b378d6780b813f3b71fd2302759f031b5a390f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_44.txt @@ -0,0 +1,3 @@ +🌎 +🚀 Deploy +- Fine-tune LLaMA 2 (7-70B) on Amazon SageMaker, a complete guide from setup to QLoRA fine-tuning and deployment on Amazon SageMaker. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_45.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f9e236a8b150d0f65d103fb85da889454c27d7d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_45.txt @@ -0,0 +1 @@ +- Deploy Llama 2 7B/13B/70B on Amazon SageMaker, a guide on using Hugging Face's LLM DLC container for secure and scalable deployment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llama2/chunk_46.txt b/chunked/content_aware_chunking/model_doc_llama2/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2313f5adf634dc85b3959c301db1d6d6b6eceee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llama2/chunk_46.txt @@ -0,0 +1,24 @@ +LlamaConfig +[[autodoc]] LlamaConfig +LlamaTokenizer +[[autodoc]] LlamaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +LlamaTokenizerFast +[[autodoc]] LlamaTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - update_post_processor + - save_vocabulary +LlamaModel +[[autodoc]] LlamaModel + - forward +LlamaForCausalLM +[[autodoc]] LlamaForCausalLM + - forward +LlamaForSequenceClassification +[[autodoc]] LlamaForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_10.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_10.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_11.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3e37c2e1a520209ad300d80dc6d554ac96687ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by ArthurZ and ybelkada. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_12.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_13.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c5e5e24ef5322b5824cc830e54a32ae386f9a35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_14.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ef3d9de7ea075c290668be11d569e1b2469e319 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_14.txt @@ -0,0 +1 @@ +Simply make sure to call processor.tokenizer.padding_side = "left" before generating. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_15.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ce634499e3cf81b02d05e767a50200c8eb0537d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_15.txt @@ -0,0 +1 @@ +Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_16.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..91196ab2b11252711dc67806a94e58013186f328 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_16.txt @@ -0,0 +1,8 @@ +For better results, we recommend users to prompt the model with the correct prompt format: + +"USER: \nASSISTANT:" +For multiple turns conversation: + +"USER: \nASSISTANT: USER: ASSISTANT: USER: ASSISTANT:" +Using Flash Attention 2 +Flash Attention 2 is an even faster, optimized version of the previous optimization, please refer to the Flash Attention 2 section of performance docs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_17.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..56bb4d130a68be69ecdab5c08f83a0d57618df44 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_17.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with BEiT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_18.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4269eff3a0e48a246224369315e1973549ae8d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_18.txt @@ -0,0 +1 @@ +A Google Colab demo on how to run Llava on a free-tier Google colab instance leveraging 4-bit inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_19.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8535f21b0674d4000f48165b7d4c277519b337b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_19.txt @@ -0,0 +1 @@ +A similar notebook showcasing batched inference. 
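To make the LLaVa tips concrete, an illustrative sketch (the llava-hf/llava-1.5-7b-hf checkpoint, the <image> placeholder token, the local image path, and a GPU with enough memory are assumptions, not details from the chunks above):

import torch
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

model_id = "llava-hf/llava-1.5-7b-hf"                     # assumed checkpoint id
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer.padding_side = "left"                 # recommended above for batched generation
model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")

image = Image.open("image.png")                           # placeholder image
prompt = "USER: <image>\nWhat is shown in this image? ASSISTANT:"   # <image> marks where the image features go
inputs = processor(text=prompt, images=image, return_tensors="pt").to(model.device)

output_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(output_ids, skip_special_tokens=True)[0])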
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_20.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05c18cd813258419e50f34919b291386b6f0443f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_20.txt @@ -0,0 +1,9 @@ +🌎 + +LlavaConfig +[[autodoc]] LlavaConfig +LlavaProcessor +[[autodoc]] LlavaProcessor +LlavaForConditionalGeneration +[[autodoc]] LlavaForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_6.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..49d5344b649cae2b1c14b4acf4621a2a09877d7c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_6.txt @@ -0,0 +1 @@ +With simple modifications to LLaVA, namely, using CLIP-ViT-L-336px with an MLP projection and adding academic-task-oriented VQA data with simple response formatting prompts, we establish stronger baselines that achieve state-of-the-art across 11 benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_7.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbde6185762971c0ce583e952467d858e22a34b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_7.txt @@ -0,0 +1 @@ +Our final 13B checkpoint uses merely 1.2M publicly available data, and finishes full training in ∼1 day on a single 8-A100 node. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_8.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ab9d1dcdba40b5ea25408994cc2a30a614c2c56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_8.txt @@ -0,0 +1 @@ +We hope this can make state-of-the-art LMM research more accessible. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_llava/chunk_9.txt b/chunked/content_aware_chunking/model_doc_llava/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a25a41c45fc67d14c1e3f58f1c740f55abe8f9bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_llava/chunk_9.txt @@ -0,0 +1,3 @@ +Code and model will be publicly available + + LLaVa architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cde193bcaf06d3f93c945bed8608b72937180cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_10.txt @@ -0,0 +1,2 @@ +You don't need to indicate which + token belongs to which segment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f13c3ccc78c88cb3f8ccf21c0fdae4ed31c2a989 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_11.txt @@ -0,0 +1,2 @@ +Just separate your segments with the separation token tokenizer.sep_token (or + ). 
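A small sketch of what the segment-separation note above means in practice: no segment bookkeeping is required, and segments can be separated with tokenizer.sep_token. The allenai/longformer-base-4096 checkpoint is an illustrative choice.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")  # illustrative checkpoint

question = "How many cats are there?"
context = "There are two cats sitting on the mat."

# No token-type / segment ids need to be supplied: either pass the two texts as a pair,
# or join them yourself with the separator token.
paired = tokenizer(question, context, return_tensors="pt")
joined = tokenizer(question + tokenizer.sep_token + context, return_tensors="pt")

print(tokenizer.decode(paired["input_ids"][0]))
```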
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4da2910559af45fc7b69bd9fcc2de3c4dd69d21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_12.txt @@ -0,0 +1 @@ +A transformer model replacing the attention matrices by sparse matrices to go faster. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..564c5fd3926cec39a03b2f8b3cc893361888266c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_13.txt @@ -0,0 +1 @@ +Often, the local context (e.g., what are the two tokens left and right?) \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad907bafe20835ee3a61aa5b296dc993745a0e93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_14.txt @@ -0,0 +1 @@ +is enough to take action for a given token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5822076f008efd7147a48b413d6e88d017430e95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_15.txt @@ -0,0 +1 @@ +Some preselected input tokens are still given global attention, but the attention matrix has way less parameters, resulting in a speed-up. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7879e8ba461af97badcf136733b7b00528eb0a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_16.txt @@ -0,0 +1 @@ +See the local attention section for more information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..96802917e4cbb76adebcdab8b8a80f44de1488b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_17.txt @@ -0,0 +1,2 @@ +Longformer Self Attention +Longformer self attention employs self attention on both a "local" context and a "global" context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7652e7d408354e65b476f2b2c5b95658a498d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_18.txt @@ -0,0 +1,4 @@ +Most tokens only +attend "locally" to each other meaning that each token attends to its \(\frac{1}{2} w\) previous tokens and +\(\frac{1}{2} w\) succeeding tokens with \(w\) being the window length as defined in +config.attention_window. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..08e5da5b037575be05f57718b9be553e01fc631a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_19.txt @@ -0,0 +1,2 @@ +Note that config.attention_window can be of type List to define a +different \(w\) for each layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3f706a7e6434978f720f1f523cdad05a40d9e76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_20.txt @@ -0,0 +1,2 @@ +A selected few tokens attend "globally" to all other tokens, as it is +conventionally done for all tokens in BertSelfAttention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..986d902255ff536927278b907e271d68560da2c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_21.txt @@ -0,0 +1 @@ +Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2245208b3af1470c06bfc9de6a395160640174b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_22.txt @@ -0,0 +1,3 @@ +Also note +that every "locally" attending token not only attends to tokens within its window \(w\), but also to all "globally" +attending tokens so that global attention is symmetric. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cd317e7b230b183298630ddeefe71dadae35f3e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_23.txt @@ -0,0 +1,2 @@ +The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor +global_attention_mask at run-time appropriately. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..adffbe42724bffe6de146bbc4d46e8e6c549d9cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_24.txt @@ -0,0 +1,5 @@ +All Longformer models employ the following logic for +global_attention_mask: + +0: the token attends "locally", +1: the token attends "globally". 
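A minimal sketch of setting global_attention_mask by hand, following the 0 = "local" / 1 = "global" convention described above. The checkpoint name and the choice to make only the first token global are illustrative assumptions.

```python
import torch
from transformers import AutoTokenizer, LongformerModel

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("A very long document " * 200, return_tensors="pt")

# Start with every token attending locally (0), then give the first token global attention (1),
# a common choice for classification-style setups.
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1

outputs = model(**inputs, global_attention_mask=global_attention_mask)
print(outputs.last_hidden_state.shape)
```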
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f12a99d6dce8438e4cb2da104236ad8a38b70b1e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_25.txt @@ -0,0 +1 @@ +For more information please also refer to [~LongformerModel.forward] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_26.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..77d7f8a155caf2828088eeeeb0bbfb6d98e6c85c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_26.txt @@ -0,0 +1,4 @@ +Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually +represents the memory and time bottleneck, can be reduced from \(\mathcal{O}(n_s \times n_s)\) to +\(\mathcal{O}(n_s \times w)\), with \(n_s\) being the sequence length and \(w\) being the average window +size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_27.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..4830b3d2d5bb80375626b2c69dd88f7b4e4801c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_27.txt @@ -0,0 +1,2 @@ +It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of +"locally" attending tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_28.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..543e50a5fa631c4b71d374777584327969fb956f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_28.txt @@ -0,0 +1 @@ +For more information, please refer to the official paper. 
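As a rough back-of-the-envelope illustration of the reduction from \(\mathcal{O}(n_s \times n_s)\) to \(\mathcal{O}(n_s \times w)\) described above, here is a tiny calculation; the concrete numbers are only an example, not taken from the text.

```python
n_s = 4096  # sequence length
w = 512     # attention window

full_attention_scores = n_s * n_s   # 16,777,216 query-key products
local_attention_scores = n_s * w    #  2,097,152 query-key products
print(full_attention_scores / local_attention_scores)  # 8.0x fewer products
```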
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longformer/chunk_29.txt b/chunked/content_aware_chunking/model_doc_longformer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..88cc4dbc3abd39311eab144f03543e9e7d72b280 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longformer/chunk_29.txt @@ -0,0 +1,75 @@ +Training +[LongformerForMaskedLM] is trained the exact same way [RobertaForMaskedLM] is +trained and should be used as follows: +thon +input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt") +mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") +loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0] + +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +LongformerConfig +[[autodoc]] LongformerConfig +LongformerTokenizer +[[autodoc]] LongformerTokenizer +LongformerTokenizerFast +[[autodoc]] LongformerTokenizerFast +Longformer specific outputs +[[autodoc]] models.longformer.modeling_longformer.LongformerBaseModelOutput +[[autodoc]] models.longformer.modeling_longformer.LongformerBaseModelOutputWithPooling +[[autodoc]] models.longformer.modeling_longformer.LongformerMaskedLMOutput +[[autodoc]] models.longformer.modeling_longformer.LongformerQuestionAnsweringModelOutput +[[autodoc]] models.longformer.modeling_longformer.LongformerSequenceClassifierOutput +[[autodoc]] models.longformer.modeling_longformer.LongformerMultipleChoiceModelOutput +[[autodoc]] models.longformer.modeling_longformer.LongformerTokenClassifierOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutputWithPooling +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerMaskedLMOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerQuestionAnsweringModelOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerSequenceClassifierOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerMultipleChoiceModelOutput +[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput + +LongformerModel +[[autodoc]] LongformerModel + - forward +LongformerForMaskedLM +[[autodoc]] LongformerForMaskedLM + - forward +LongformerForSequenceClassification +[[autodoc]] LongformerForSequenceClassification + - forward +LongformerForMultipleChoice +[[autodoc]] LongformerForMultipleChoice + - forward +LongformerForTokenClassification +[[autodoc]] LongformerForTokenClassification + - forward +LongformerForQuestionAnswering +[[autodoc]] LongformerForQuestionAnswering + - forward + +TFLongformerModel +[[autodoc]] TFLongformerModel + - call +TFLongformerForMaskedLM +[[autodoc]] TFLongformerForMaskedLM + - call +TFLongformerForQuestionAnswering +[[autodoc]] TFLongformerForQuestionAnswering + - call +TFLongformerForSequenceClassification +[[autodoc]] TFLongformerForSequenceClassification + - call +TFLongformerForTokenClassification +[[autodoc]] TFLongformerForTokenClassification + - call +TFLongformerForMultipleChoice +[[autodoc]] TFLongformerForMultipleChoice + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_10.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_10.txt new file mode 100644 index 
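The training snippet above uses the older masked_lm_labels argument. As a hedged alternative, here is a minimal masked-LM sketch that passes only labels (which is how recent versions of Transformers compute the MLM loss) and inserts the tokenizer's own mask token; the checkpoint is illustrative.

```python
from transformers import AutoTokenizer, LongformerForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerForMaskedLM.from_pretrained("allenai/longformer-base-4096")

masked = f"This is a sentence from {tokenizer.mask_token} training data"
original = "This is a sentence from the training data"

input_ids = tokenizer(masked, return_tensors="pt").input_ids
labels = tokenizer(original, return_tensors="pt").input_ids

# The loss is computed against `labels`; positions you do not want to score are
# usually set to -100, omitted here to keep the sketch short.
loss = model(input_ids, labels=labels).loss
loss.backward()
```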
0000000000000000000000000000000000000000..4b864eecbc8959af3878027ebfc69fcdfcf15597 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_10.txt @@ -0,0 +1,4 @@ +Usage tips + +[LongT5ForConditionalGeneration] is an extension of [T5ForConditionalGeneration] exchanging the traditional +encoder self-attention layer with efficient either local attention or transient-global (tglobal) attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_11.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5661515907c853451abb49c5bfd9726722812c5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_11.txt @@ -0,0 +1 @@ +Unlike the T5 model, LongT5 does not use a task prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_12.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8776759d30cdcdb3398ec7c47dfb1f6d6222da74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_12.txt @@ -0,0 +1,2 @@ +Furthermore, it uses a different pre-training objective +inspired by the pre-training of [PegasusForConditionalGeneration]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_13.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..7726dec0515c453980a600f4e448fcb03549aeae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_13.txt @@ -0,0 +1,2 @@ +LongT5 model is designed to work efficiently and very well on long-range sequence-to-sequence tasks where the +input sequence exceeds commonly used 512 tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_14.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac4a1834bc4afd1e1916b463a59b49d8d14d4d2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_14.txt @@ -0,0 +1 @@ +It is capable of handling input sequences of a length up to 16,384 tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_15.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..daaf611b6171c5b473c438329e91680e42cf037a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_15.txt @@ -0,0 +1,2 @@ +For Local Attention, the sparse sliding-window local attention operation allows a given token to attend only r +tokens to the left and right of it (with r=127 by default). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_16.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f4eb7c0362e12d411c28c5f608b0f7b001649d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_16.txt @@ -0,0 +1,2 @@ +Local Attention does not introduce any new parameters +to the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_17.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..19bce6f38e5d10f09051759d010b30004dd92103 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_17.txt @@ -0,0 +1 @@ +The complexity of the mechanism is linear in input sequence length l: O(l*r). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_18.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59d888bbc3b2e0896d9e463ab49be89b685fbc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_18.txt @@ -0,0 +1 @@ +Transient Global Attention is an extension of the Local Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_19.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfdf813a9475f6e7bef0eea8b9c36fb2cd6ecf36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_19.txt @@ -0,0 +1,2 @@ +It, furthermore, allows each input token to +interact with all other tokens in the layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_20.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fd1b43fa0e8e95493234db29868192ae49de83d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_20.txt @@ -0,0 +1,2 @@ +This is achieved via splitting an input sequence into blocks of a fixed +length k (with a default k=16). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_21.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eceb9e73c97ef498f04961328db89b3d8dce467 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_21.txt @@ -0,0 +1,2 @@ +Then, a global token for such a block is obtained via summing and normalizing the embeddings of every token +in the block. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_22.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5603eae29d411ef6562aaeb51d567b109abfea0a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_22.txt @@ -0,0 +1,3 @@ +Thanks to this, the attention allows each token to attend to both nearby tokens like in Local attention, and +also every global token like in the case of standard global attention (transient represents the fact the global tokens +are constructed dynamically within each attention operation). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_23.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..97b02e3f4c4494b366e710a8ff5c7e0dc6d17411 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_23.txt @@ -0,0 +1,2 @@ +As a consequence, TGlobal attention introduces +a few new parameters -- global relative position biases and a layer normalization for global token's embedding. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_24.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..37be6cc331130fbea620fa10e0a37b3b01525306 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_24.txt @@ -0,0 +1 @@ +The complexity of this mechanism is O(l(r + l/k)). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_25.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d9cf618b79f99b8c4a2b22d778cd40da09612b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_25.txt @@ -0,0 +1 @@ +An example showing how to evaluate a fine-tuned LongT5 model on the pubmed dataset is below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_longt5/chunk_26.txt b/chunked/content_aware_chunking/model_doc_longt5/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3522ffa3683af1e597284d80beb9139c34e2649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_longt5/chunk_26.txt @@ -0,0 +1,53 @@ +thon + +import evaluate +from datasets import load_dataset +from transformers import AutoTokenizer, LongT5ForConditionalGeneration +dataset = load_dataset("scientific_papers", "pubmed", split="validation") +model = ( + LongT5ForConditionalGeneration.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps") + .to("cuda") + .half() + ) +tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps") +def generate_answers(batch): + inputs_dict = tokenizer( + batch["article"], max_length=16384, padding="max_length", truncation=True, return_tensors="pt" + ) + input_ids = inputs_dict.input_ids.to("cuda") + attention_mask = inputs_dict.attention_mask.to("cuda") + output_ids = model.generate(input_ids, attention_mask=attention_mask, max_length=512, num_beams=2) + batch["predicted_abstract"] = tokenizer.batch_decode(output_ids, skip_special_tokens=True) + return batch +result = dataset.map(generate_answer, batched=True, batch_size=2) +rouge = evaluate.load("rouge") +rouge.compute(predictions=result["predicted_abstract"], references=result["abstract"]) + +Resources + +Translation task guide +Summarization task guide + +LongT5Config +[[autodoc]] LongT5Config + +LongT5Model +[[autodoc]] LongT5Model + - forward +LongT5ForConditionalGeneration +[[autodoc]] LongT5ForConditionalGeneration + - forward +LongT5EncoderModel +[[autodoc]] LongT5EncoderModel + - forward + +FlaxLongT5Model +[[autodoc]] FlaxLongT5Model + - call + - encode + - decode +FlaxLongT5ForConditionalGeneration +[[autodoc]] FlaxLongT5ForConditionalGeneration + - call + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_13.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b091102a4a4a136f539d93ac9a1aca153899722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_13.txt @@ -0,0 +1,2 @@ +LUKE treats entities as input tokens; therefore, it takes entity_ids, entity_attention_mask, + entity_token_type_ids and entity_position_ids as extra input. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_14.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..077a0ea1e726a4c444112a3e1ebca64b0ae3eb6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_14.txt @@ -0,0 +1,2 @@ +You can obtain those using + [LukeTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_15.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5944e3e6b77838b636212c429b2b2fd6b62f404 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_15.txt @@ -0,0 +1,2 @@ +[LukeTokenizer] takes entities and entity_spans (character-based start and end + positions of the entities in the input text) as extra input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_16.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d142c30669583c376867f6c82ccf919ccc755ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_16.txt @@ -0,0 +1,2 @@ +entities typically consist of [MASK] entities or + Wikipedia entities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_17.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..54b5e4894d80253b156bd3d4e7663e2cbe99a97e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_17.txt @@ -0,0 +1,4 @@ +The brief description when inputting these entities are as follows: + +Inputting [MASK] entities to compute entity representations: The [MASK] entity is used to mask entities to be + predicted during pretraining. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_18.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..18e658a4395829e6968733ac3d7baa39765b9bf5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_18.txt @@ -0,0 +1,2 @@ +When LUKE receives the [MASK] entity, it tries to predict the original entity by + gathering the information about the entity from the input text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_19.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..668609e77e50363b894731a8b318eaa7aaadd5b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_19.txt @@ -0,0 +1,3 @@ +Therefore, the [MASK] entity can be used to address + downstream tasks requiring the information of entities in text such as entity typing, relation classification, and + named entity recognition. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_20.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..1866ed73ca6ee5a98fe586fe68396591f7d367f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_20.txt @@ -0,0 +1,2 @@ +Inputting Wikipedia entities to compute knowledge-enhanced token representations: LUKE learns rich information + (or knowledge) about Wikipedia entities during pretraining and stores the information in its entity embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_21.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0053e7498545285022e6239680542a89478ecb83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_21.txt @@ -0,0 +1,3 @@ +By + using Wikipedia entities as input tokens, LUKE outputs token representations enriched by the information stored in + the embeddings of these entities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_22.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..20a9012aaece21934e7dfd5c3e1c72073a240811 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_22.txt @@ -0,0 +1,2 @@ +This is particularly effective for tasks requiring real-world knowledge, such as + question answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_23.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ffc36b07da8806e3a9e493255ccd86183caf539 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_23.txt @@ -0,0 +1,4 @@ +There are three head models for the former use case: + +[LukeForEntityClassification], for tasks to classify a single entity in an input text such as + entity typing, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_24.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..39b2b2dff3e8a729d273554527f6fd3c129041f7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_24.txt @@ -0,0 +1 @@ +the Open Entity dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_25.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e635176e505990ae112aee5cc5d0da49973ac15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_25.txt @@ -0,0 +1 @@ +This model places a linear head on top of the output entity representation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_26.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..402fa891490e45506fd7c1f7101181d33c0ffb6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_26.txt @@ -0,0 +1,2 @@ +[LukeForEntityPairClassification], for tasks to classify the relationship between two entities + such as relation classification, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_27.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2facc863b18fd6765ac009cca04263c2800b3fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_27.txt @@ -0,0 +1 @@ +the TACRED dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_28.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d40b75782ae3b4cdd562f351bd6ddb7a4b3611e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_28.txt @@ -0,0 +1,2 @@ +This + model places a linear head on top of the concatenated output representation of the pair of given entities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_29.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fe2cc6a4130a01fff202d7348383e554e1f4b1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_29.txt @@ -0,0 +1,2 @@ +[LukeForEntitySpanClassification], for tasks to classify the sequence of entity spans, such as + named entity recognition (NER). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_30.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ecf856b79b11e3ff8a75ca4103168e27db2480c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_30.txt @@ -0,0 +1 @@ +This model places a linear head on top of the output entity representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_31.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d2bf88e99a2cb1549676d249e10aa0c0d90cace --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_31.txt @@ -0,0 +1,2 @@ +You + can address NER using this model by inputting all possible entity spans in the text to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_32.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..22792fabeb176528b687cd3eb19f53d7a5bd43d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_32.txt @@ -0,0 +1,3 @@ +[LukeTokenizer] has a task argument, which enables you to easily create an input to these + head models by specifying task="entity_classification", task="entity_pair_classification", or + task="entity_span_classification". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_33.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a38465b6b6185ba2828043fbe9f77a1da392187 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_33.txt @@ -0,0 +1 @@ +Please refer to the example code of each head models. 
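As a complement to the pair-classification example that follows, here is a hedged sketch of the task="entity_classification" path mentioned above. The studio-ousia/luke-large-finetuned-open-entity checkpoint is an illustrative choice.

```python
from transformers import LukeForEntityClassification, LukeTokenizer

model_name = "studio-ousia/luke-large-finetuned-open-entity"  # illustrative checkpoint
tokenizer = LukeTokenizer.from_pretrained(model_name, task="entity_classification")
model = LukeForEntityClassification.from_pretrained(model_name)

text = "Beyoncé lives in Los Angeles."
entity_spans = [(0, 7)]  # character-based span of the single entity to classify ("Beyoncé")

inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt")
logits = model(**inputs).logits
predicted_class_idx = int(logits[0].argmax())
print("Predicted class:", model.config.id2label[predicted_class_idx])
```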
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_34.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..54aacf520282cd5720b35150a5c29077fdc8e4a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_34.txt @@ -0,0 +1,10 @@ +Usage example: +thon + +from transformers import LukeTokenizer, LukeModel, LukeForEntityPairClassification +model = LukeModel.from_pretrained("studio-ousia/luke-base") +tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-base") + +Example 1: Computing the contextualized entity representation corresponding to the entity mention "Beyoncé" + +text = "Beyoncé lives in Los Angeles." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_luke/chunk_35.txt b/chunked/content_aware_chunking/model_doc_luke/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..9708d8f979a0f31da6b2fccf6a5a30364bee88b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_luke/chunk_35.txt @@ -0,0 +1,72 @@ +entity_spans = [(0, 7)] # character-based entity span corresponding to "Beyoncé" +inputs = tokenizer(text, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt") +outputs = model(**inputs) +word_last_hidden_state = outputs.last_hidden_state +entity_last_hidden_state = outputs.entity_last_hidden_state + +Example 2: Inputting Wikipedia entities to obtain enriched contextualized representations + +entities = [ + "Beyoncé", + "Los Angeles", + ] # Wikipedia entity titles corresponding to the entity mentions "Beyoncé" and "Los Angeles" +entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles" +inputs = tokenizer(text, entities=entities, entity_spans=entity_spans, add_prefix_space=True, return_tensors="pt") +outputs = model(**inputs) +word_last_hidden_state = outputs.last_hidden_state +entity_last_hidden_state = outputs.entity_last_hidden_state + +Example 3: Classifying the relationship between two entities using LukeForEntityPairClassification head model + +model = LukeForEntityPairClassification.from_pretrained("studio-ousia/luke-large-finetuned-tacred") +tokenizer = LukeTokenizer.from_pretrained("studio-ousia/luke-large-finetuned-tacred") +entity_spans = [(0, 7), (17, 28)] # character-based entity spans corresponding to "Beyoncé" and "Los Angeles" +inputs = tokenizer(text, entity_spans=entity_spans, return_tensors="pt") +outputs = model(**inputs) +logits = outputs.logits +predicted_class_idx = int(logits[0].argmax()) +print("Predicted class:", model.config.id2label[predicted_class_idx]) + +Resources + +A demo notebook on how to fine-tune [LukeForEntityPairClassification] for relation classification +Notebooks showcasing how you to reproduce the results as reported in the paper with the HuggingFace implementation of LUKE +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +LukeConfig +[[autodoc]] LukeConfig +LukeTokenizer +[[autodoc]] LukeTokenizer + - call + - save_vocabulary +LukeModel +[[autodoc]] LukeModel + - forward +LukeForMaskedLM +[[autodoc]] LukeForMaskedLM + - forward +LukeForEntityClassification +[[autodoc]] LukeForEntityClassification + - forward +LukeForEntityPairClassification +[[autodoc]] LukeForEntityPairClassification + - forward +LukeForEntitySpanClassification +[[autodoc]] 
LukeForEntitySpanClassification + - forward +LukeForSequenceClassification +[[autodoc]] LukeForSequenceClassification + - forward +LukeForMultipleChoice +[[autodoc]] LukeForMultipleChoice + - forward +LukeForTokenClassification +[[autodoc]] LukeForTokenClassification + - forward +LukeForQuestionAnswering +[[autodoc]] LukeForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7199f80344eece0f2dd44eb0a3aa4ed434b14aa5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_10.txt @@ -0,0 +1,4 @@ +Lastly, we demonstrate detailed ablation studies to prove that both our novel +model components and pretraining strategies significantly contribute to our strong results; and also present several +attention visualizations for the different encoders +This model was contributed by eltoto1219. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb3398000ff4c2685dcc9025b0d656b2bb87adda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +Bounding boxes are not necessary to be used in the visual feature embeddings, any kind of visual-spacial features + will work. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..e183e57efd49b1da536cec22966e6f1fb5a75bde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_13.txt @@ -0,0 +1,2 @@ +Both the language hidden states and the visual hidden states that LXMERT outputs are passed through the + cross-modality layer, so they contain information from both modalities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd52ab1f869bc88406d6cbdf46004a821057f6a0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_14.txt @@ -0,0 +1,2 @@ +To access a modality that only attends to + itself, select the vision/language hidden states from the first input in the tuple. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c359d22285c8bcb2897d5440fb50ce0a180f9ca4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_15.txt @@ -0,0 +1,2 @@ +The bidirectional cross-modality encoder attention only returns attention values when the language modality is used + as the input and the vision modality is used as the context vector. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9522e66d159278828e00869de345e817347cffe4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_16.txt @@ -0,0 +1,3 @@ +Further, while the cross-modality encoder + contains self-attention for each respective modality and cross-attention, only the cross attention is returned and + both self attention outputs are disregarded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_lxmert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_lxmert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4931bfba22ad807fb3f45432444e6271f1d20d9f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_lxmert/chunk_17.txt @@ -0,0 +1,33 @@ +Resources + +Question answering task guide + +LxmertConfig +[[autodoc]] LxmertConfig +LxmertTokenizer +[[autodoc]] LxmertTokenizer +LxmertTokenizerFast +[[autodoc]] LxmertTokenizerFast +Lxmert specific outputs +[[autodoc]] models.lxmert.modeling_lxmert.LxmertModelOutput +[[autodoc]] models.lxmert.modeling_lxmert.LxmertForPreTrainingOutput +[[autodoc]] models.lxmert.modeling_lxmert.LxmertForQuestionAnsweringOutput +[[autodoc]] models.lxmert.modeling_tf_lxmert.TFLxmertModelOutput +[[autodoc]] models.lxmert.modeling_tf_lxmert.TFLxmertForPreTrainingOutput + +LxmertModel +[[autodoc]] LxmertModel + - forward +LxmertForPreTraining +[[autodoc]] LxmertForPreTraining + - forward +LxmertForQuestionAnswering +[[autodoc]] LxmertForQuestionAnswering + - forward + +TFLxmertModel +[[autodoc]] TFLxmertModel + - call +TFLxmertForPreTraining +[[autodoc]] TFLxmertForPreTraining + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_10.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..58dfb540788f5cf90933db923993eeeefa70c180 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_10.txt @@ -0,0 +1,2 @@ +Usage tips and examples +M2M100 is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_11.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e0547ee03b73ad8c43a1f0706a2ccec8d213b9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_11.txt @@ -0,0 +1,3 @@ +As the model is +multilingual it expects the sequences in a certain format: A special language id token is used as prefix in both the +source and target text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_12.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b88756ec6f33578bc4d394ac30afd1c10ef2f8f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_12.txt @@ -0,0 +1,2 @@ +The source text format is [lang_code] X [eos], where lang_code is source language +id for source text and target language id for target text, with X being the source or target text. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_13.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9deb11b91ef6bd682a474c1cd264caed54fe3ed2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_13.txt @@ -0,0 +1,2 @@ +The [M2M100Tokenizer] depends on sentencepiece so be sure to install it before running the +examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_14.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..89ac7a95162911d48f095f1a99c99c61bc223cb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_14.txt @@ -0,0 +1 @@ +To install sentencepiece run pip install sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_15.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0cceb9f4d73497f47c5aaabd00d9bda14006f8db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_15.txt @@ -0,0 +1,6 @@ +Supervised Training +thon +from transformers import M2M100Config, M2M100ForConditionalGeneration, M2M100Tokenizer +model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="en", tgt_lang="fr") +src_text = "Life is like a box of chocolates." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_16.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d81c6bf2cb49ed8a985873c0299dfae9b2ba2046 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_16.txt @@ -0,0 +1 @@ +tgt_text = "La vie est comme une boîte de chocolat." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_17.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e6da6ecb550e60c8634cf8ff5bbd0c60ddfd609 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_17.txt @@ -0,0 +1,6 @@ +model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") +loss = model(**model_inputs).loss # forward pass + +Generation +M2M100 uses the eos_token_id as the decoder_start_token_id for generation with the target language id +being forced as the first generated token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_18.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4402e228f4438496e4d56df4b192ed3dad5879ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_18.txt @@ -0,0 +1,2 @@ +To force the target language id as the first generated token, pass the +forced_bos_token_id parameter to the generate method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_19.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..01ca528e1435007b708e0a7e05ffc8f70f552e12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_19.txt @@ -0,0 +1,2 @@ +The following example shows how to translate from +Hindi to French and from Chinese to English using the facebook/m2m100_418M checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_20.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..27293b7011968b7fa263645a34ccb16a2f5fa451 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_20.txt @@ -0,0 +1,13 @@ +python + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +hi_text = "जीवन एक चॉकलेट बॉक्स की तरह है।" +chinese_text = "生活就像一盒巧克力。" +model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") +tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M") +# translate Hindi to French +tokenizer.src_lang = "hi" +encoded_hi = tokenizer(hi_text, return_tensors="pt") +generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.get_lang_id("fr")) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"La vie est comme une boîte de chocolat." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_21.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c89ab4d29a62a346410bf17b247ca48d0185ea8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_21.txt @@ -0,0 +1,6 @@ +# translate Chinese to English +tokenizer.src_lang = "zh" +encoded_zh = tokenizer(chinese_text, return_tensors="pt") +generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Life is like a box of chocolate." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_22.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..37461590a9833b2ffe749b7ab8e650b37fc938e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_22.txt @@ -0,0 +1,19 @@ +Resources + +Translation task guide +Summarization task guide + +M2M100Config +[[autodoc]] M2M100Config +M2M100Tokenizer +[[autodoc]] M2M100Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +M2M100Model +[[autodoc]] M2M100Model + - forward +M2M100ForConditionalGeneration +[[autodoc]] M2M100ForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_m2m_100/chunk_9.txt b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1bd8890a67017e8946e3bb4f5ce5f1b685e242a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_m2m_100/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by valhalla.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_10.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e77d9522ba5659b60307e3169099a86576f013 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_10.txt @@ -0,0 +1,4 @@ +", return_tensors="pt") +outputs = model.generate(**inputs) +print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) +['Eu amo pizza!'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_11.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed1afd01528386aba9e39a9c564e2f182018e674 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_11.txt @@ -0,0 +1,11 @@ +Google has released the following variants: + +google/madlad400-3b-mt + +google/madlad400-7b-mt + +google/madlad400-7b-mt-bt + +google/madlad400-10b-mt + +The original checkpoints can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_12.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e1fdf45462297516407b979930187d91ff11c5a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_12.txt @@ -0,0 +1 @@ +Refer to T5's documentation page for all API references, code examples, and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_13.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..e192004d830b0fed5c3ec6fc148cf2558f4dca76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_13.txt @@ -0,0 +1 @@ +For more details regarding training and evaluation of the MADLAD-400, refer to the model card. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_5.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..260c8925ead064766cbd1b5299e417428102978b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_5.txt @@ -0,0 +1,2 @@ +We make the baseline models 1 +available to the research community. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_6.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5326b207ab6e131194a8dbffbec9aad6e1a786a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_6.txt @@ -0,0 +1 @@ +This model was added by Juarez Bochi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_7.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..259b9a2fdf2cc00ef761f1878e58809b024e55bf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_7.txt @@ -0,0 +1 @@ +The original checkpoints can be found here. 
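The `<2pt>` prefix used in the MADLAD-400 snippet split across the surrounding chunks selects the target language. As a hedged sketch, the same convention with a different target language tag (`<2de>` for German, assumed here by analogy), using the google/madlad400-3b-mt checkpoint referenced above:

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model = AutoModelForSeq2SeqLM.from_pretrained("google/madlad400-3b-mt")
tokenizer = AutoTokenizer.from_pretrained("google/madlad400-3b-mt")

# "<2de>" asks for German output; "<2pt>" (Portuguese) is the tag used in the nearby example.
inputs = tokenizer("<2de> I love pizza!", return_tensors="pt")
outputs = model.generate(**inputs)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```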
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_8.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..887a470ce35b1e9ed372bd175c8e21b765b89129 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_8.txt @@ -0,0 +1 @@ +This is a machine translation model that supports many low-resource languages, and that is competitive with models that are significantly larger. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_madlad-400/chunk_9.txt b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9088ad9f0bba4287a03b5a7e1063074d20c7d12a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_madlad-400/chunk_9.txt @@ -0,0 +1,7 @@ +One can directly use MADLAD-400 weights without finetuning the model: +thon + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +model = AutoModelForSeq2SeqLM.from_pretrained("google/madlad400-3b-mt") +tokenizer = AutoTokenizer.from_pretrained("google/madlad400-3b-mt") +inputs = tokenizer("<2pt> I love pizza! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_10.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c6d39efd5fcef95fe644a7442c26aea450d2003 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_10.txt @@ -0,0 +1,4 @@ +Naming + +All model names use the following format: Helsinki-NLP/opus-mt-{src}-{tgt} +The language codes used to name models are inconsistent. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_11.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fa1eed9c97528800cded2fd27197cef1758ce07 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_11.txt @@ -0,0 +1,2 @@ +Two digit codes can usually be found here, three digit codes require googling "language + code {code}". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_12.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..06ea7ea2028c02710177895c4562c0680a8964d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_12.txt @@ -0,0 +1 @@ +Codes formatted like es_AR are usually code_{region}. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_13.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f509f4381634102dc9a7dbde87af1aae81e0229 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_13.txt @@ -0,0 +1 @@ +That one is Spanish from Argentina. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_14.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b83b1157c69b8bb754b2e9e6df454f258ff809d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_14.txt @@ -0,0 +1 @@ +The models were converted in two stages. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_15.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb497180187669de6d11e5f5bef7ebe677997f77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_15.txt @@ -0,0 +1,2 @@ +The first 1000 models use ISO-639-2 codes to identify languages, the second + group use a combination of ISO-639-5 codes and ISO-639-2 codes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_16.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a46dd8a56bbec31294911596803cf12b5147459c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_16.txt @@ -0,0 +1,4 @@ +Examples + +Since Marian models are smaller than many other translation models available in the library, they can be useful for + fine-tuning experiments and integration tests. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_17.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c47837a4c26e7aeff375604a275940d42dd5d816 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_17.txt @@ -0,0 +1,7 @@ +Fine-tune on GPU + +Multilingual Models + +All model names use the following format: Helsinki-NLP/opus-mt-{src}-{tgt}: +If a model can output multiple languages, and you should specify a language code by prepending the desired output + language to the src_text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_18.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ff2f2b22511da3463db932de59b462f20b6242 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_18.txt @@ -0,0 +1 @@ +You can see a models's supported language codes in its model card, under target constituents, like in opus-mt-en-roa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_19.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..7301c5262da17b7467e8d67f5e334c94a2b05f83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_19.txt @@ -0,0 +1,2 @@ +Note that if a model is only multilingual on the source side, like Helsinki-NLP/opus-mt-roa-en, no language + codes are required. 
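As noted above, a model that is only multilingual on the source side needs no language-code prefix. A minimal sketch with Helsinki-NLP/opus-mt-roa-en, mirroring the target-side multilingual example that follows:

from transformers import MarianMTModel, MarianTokenizer

model_name = "Helsinki-NLP/opus-mt-roa-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Romance-language inputs carry no ">>lang<<" prefix because the target is always English
src_text = ["Isto deve ir para o inglês.", "Y esto al inglés."]
translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
print([tokenizer.decode(t, skip_special_tokens=True) for t in translated])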
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_20.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..327e7453235df61b591dc3bec7489b01c3bd703c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_20.txt @@ -0,0 +1,19 @@ +New multi-lingual models from the Tatoeba-Challenge repo +require 3 character language codes: +thon + +from transformers import MarianMTModel, MarianTokenizer +src_text = [ + ">>fra<< this is a sentence in english that we want to translate to french", + ">>por<< This should go to portuguese", + ">>esp<< And this to Spanish", + ] +model_name = "Helsinki-NLP/opus-mt-en-roa" +tokenizer = MarianTokenizer.from_pretrained(model_name) +print(tokenizer.supported_language_codes) +['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<'] +model = MarianMTModel.from_pretrained(model_name) +translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True)) +[tokenizer.decode(t, skip_special_tokens=True) for t in translated] +["c'est une phrase en anglais que nous voulons traduire en français", + 'Isto deve ir para o português. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_21.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..5803d58a7153298e887948beed1d6c38584425e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_21.txt @@ -0,0 +1,53 @@ +', + 'Y esto al español'] + +Here is the code to see all available pretrained models on the hub: +thon +from huggingface_hub import list_models +model_list = list_models() +org = "Helsinki-NLP" +model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)] +suffix = [x.split("/")[1] for x in model_ids] +old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()] + +Old Style Multi-Lingual Models +These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language +group: +python no-style +['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU', + 'Helsinki-NLP/opus-mt-ROMANCE-en', + 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA', + 'Helsinki-NLP/opus-mt-de-ZH', + 'Helsinki-NLP/opus-mt-en-CELTIC', + 'Helsinki-NLP/opus-mt-en-ROMANCE', + 'Helsinki-NLP/opus-mt-es-NORWAY', + 'Helsinki-NLP/opus-mt-fi-NORWAY', + 'Helsinki-NLP/opus-mt-fi-ZH', + 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI', + 'Helsinki-NLP/opus-mt-sv-NORWAY', + 'Helsinki-NLP/opus-mt-sv-ZH'] +GROUP_MEMBERS = { + 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'], + 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'], + 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], + 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'], + 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'], + 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'], + 'CELTIC': ['ga', 'cy', 
'br', 'gd', 'kw', 'gv'] +} +Example of translating english to many romance languages, using old-style 2 character language codes +thon + +from transformers import MarianMTModel, MarianTokenizer +src_text = [ + ">>fr<< this is a sentence in english that we want to translate to french", + ">>pt<< This should go to portuguese", + ">>es<< And this to Spanish", + ] +model_name = "Helsinki-NLP/opus-mt-en-ROMANCE" +tokenizer = MarianTokenizer.from_pretrained(model_name) +model = MarianMTModel.from_pretrained(model_name) +translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True)) +tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated] +["c'est une phrase en anglais que nous voulons traduire en français", + 'Isto deve ir para o português. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_22.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac9d490eb3fb1760793a95d4d54b25e6539a8664 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_22.txt @@ -0,0 +1,38 @@ +', + 'Y esto al español'] + +Resources + +Translation task guide +Summarization task guide +Causal language modeling task guide + +MarianConfig +[[autodoc]] MarianConfig +MarianTokenizer +[[autodoc]] MarianTokenizer + - build_inputs_with_special_tokens + +MarianModel +[[autodoc]] MarianModel + - forward +MarianMTModel +[[autodoc]] MarianMTModel + - forward +MarianForCausalLM +[[autodoc]] MarianForCausalLM + - forward + +TFMarianModel +[[autodoc]] TFMarianModel + - call +TFMarianMTModel +[[autodoc]] TFMarianMTModel + - call + +FlaxMarianModel +[[autodoc]] FlaxMarianModel + - call +FlaxMarianMTModel +[[autodoc]] FlaxMarianMTModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_5.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..761fc43cb99ded872f9bbeb4ff8fbd759b295897 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_5.txt @@ -0,0 +1 @@ +Models were originally trained by Jörg Tiedemann using the Marian C++ library, which supports fast training and translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_6.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..678ffe82172b546cfb65c0ebdd3432ecf469cdaf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_6.txt @@ -0,0 +1 @@ +All models are transformer encoder-decoders with 6 layers in each component. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_7.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..745281473b3940f7472134cdfb7b25aca35130cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_7.txt @@ -0,0 +1,2 @@ +Each model's performance is documented + in a model card. 
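A quick way to check the 6-layer encoder/decoder layout mentioned above is to inspect a checkpoint's configuration. A minimal sketch, using Helsinki-NLP/opus-mt-en-de as an assumed example checkpoint:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("Helsinki-NLP/opus-mt-en-de")
# both values should be 6 for the converted opus-mt checkpoints
print(config.encoder_layers, config.decoder_layers)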
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_8.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4b702b30394e7817f5b3913c4ed9efd2c1b1cc7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_8.txt @@ -0,0 +1 @@ +The 80 opus models that require BPE preprocessing are not supported. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_marian/chunk_9.txt b/chunked/content_aware_chunking/model_doc_marian/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c256d8ae04f9fd8b96048e33ed109f580225ee6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_marian/chunk_9.txt @@ -0,0 +1,8 @@ +The modeling code is the same as [BartForConditionalGeneration] with a few minor modifications: + +static (sinusoid) positional embeddings (MarianConfig.static_position_embeddings=True) + +no layernorm_embedding (MarianConfig.normalize_embedding=False) +the model starts generating with pad_token_id (which has 0 as a token_embedding) as the prefix (Bart uses + ), +Code to bulk convert models can be found in convert_marian_to_pytorch.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_17.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1962d0695de9ff32eb41c80e7fa7b71b45471e31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_17.txt @@ -0,0 +1,3 @@ +Usage: MarkupLMProcessor +The easiest way to prepare data for the model is to use [MarkupLMProcessor], which internally combines a feature extractor +([MarkupLMFeatureExtractor]) and a tokenizer ([MarkupLMTokenizer] or [MarkupLMTokenizerFast]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_18.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..31d027b39649cf65b30f98d1a59fd9ef6728235f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_18.txt @@ -0,0 +1,3 @@ +The feature extractor is +used to extract all nodes and xpaths from the HTML strings, which are then provided to the tokenizer, which turns them into the +token-level inputs of the model (input_ids etc.). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_19.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd5a71c7b421f2344e2bcf7814be7df814ba0528 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_19.txt @@ -0,0 +1,2 @@ +Note that you can still use the feature extractor and tokenizer separately, +if you only want to handle one of the two tasks. 
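A minimal sketch of using the feature extractor on its own, as mentioned above, to pull nodes and xpaths out of an HTML string (the HTML below is a made-up example reusing the wording from the processor examples that follow):

from transformers import MarkupLMFeatureExtractor

feature_extractor = MarkupLMFeatureExtractor()
html_string = "<html><body><h1>Welcome</h1><p>Here is my website.</p></body></html>"

encoding = feature_extractor(html_string)
print(encoding["nodes"])   # e.g. [['Welcome', 'Here is my website.']]
print(encoding["xpaths"])  # the xpath of each extracted node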
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_20.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe375dd41ae41e164a5e3659579e20035b68a355 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_20.txt @@ -0,0 +1,8 @@ +thon +from transformers import MarkupLMFeatureExtractor, MarkupLMTokenizerFast, MarkupLMProcessor +feature_extractor = MarkupLMFeatureExtractor() +tokenizer = MarkupLMTokenizerFast.from_pretrained("microsoft/markuplm-base") +processor = MarkupLMProcessor(feature_extractor, tokenizer) + +In short, one can provide HTML strings (and possibly additional data) to [MarkupLMProcessor], +and it will create the inputs expected by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_21.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea128c24aacffae6f65ae84a86efde7295d4bcda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_21.txt @@ -0,0 +1,2 @@ +Internally, the processor first uses +[MarkupLMFeatureExtractor] to get a list of nodes and corresponding xpaths. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_22.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..367bf52de2f1550ea5898ba0cefba8183897cf29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_22.txt @@ -0,0 +1,3 @@ +The nodes and +xpaths are then provided to [MarkupLMTokenizer] or [MarkupLMTokenizerFast], which converts them +to token-level input_ids, attention_mask, token_type_ids, xpath_subs_seq, xpath_tags_seq. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_23.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9876776f79236320d8dc099d4c1df1ece406dd51 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_23.txt @@ -0,0 +1 @@ +Optionally, one can provide node labels to the processor, which are turned into token-level labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_24.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..656fd8610254eb77eeefd80958528f240154d2fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_24.txt @@ -0,0 +1,2 @@ +[MarkupLMFeatureExtractor] uses Beautiful Soup, a Python library for +pulling data out of HTML and XML files, under the hood. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_25.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..57e40eee9a23a951b92381aaa0a5bf7ae1499089 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_25.txt @@ -0,0 +1,2 @@ +Note that you can still use your own parsing solution of +choice, and provide the nodes and xpaths yourself to [MarkupLMTokenizer] or [MarkupLMTokenizerFast]. 
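Conversely, a minimal sketch of skipping the feature extractor and handing your own nodes and xpaths straight to the fast tokenizer (the nodes and xpaths are the same toy values used in the processor examples below):

from transformers import MarkupLMTokenizerFast

tokenizer = MarkupLMTokenizerFast.from_pretrained("microsoft/markuplm-base")

# nodes/xpaths produced by your own HTML parsing solution
nodes = ["hello", "world", "how", "are"]
xpaths = ["/html/body/div/li[1]/div/span", "/html/body/div/li[1]/div/span", "html/body", "html/body/div"]

encoding = tokenizer(nodes, xpaths=xpaths, return_tensors="pt")
print(encoding.keys())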
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_26.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b5541f8bd7f5eb0180a23f5163bcdef407e3570 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_26.txt @@ -0,0 +1 @@ +In total, there are 5 use cases that are supported by the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_27.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..e615b20e64512fd039f3ae01c2c2b77df0e22068 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_27.txt @@ -0,0 +1 @@ +Below, we list them all. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_28.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b096c20bea5ea96d175a4179b378b9e51468de66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_28.txt @@ -0,0 +1,2 @@ +Note that each of these +use cases work for both batched and non-batched inputs (we illustrate them for non-batched inputs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_29.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..d112a1f9c1ae5590d6da99fb6ef7bcb42b8e4b38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_29.txt @@ -0,0 +1,2 @@ +Use case 1: web page classification (training, inference) + token classification (inference), parse_html = True +This is the simplest case, in which the processor will use the feature extractor to get all nodes and xpaths from the HTML. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_30.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..44b4bdffd5910e534c282cb863ad83d4450cd36a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_30.txt @@ -0,0 +1,15 @@ +thon + +from transformers import MarkupLMProcessor +processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base") +html_string = """ + + + + Hello world + + + Welcome + Here is my website. + + """ \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_31.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..01b14aca1641e679a8725d52d0b33116ae1aa3e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_31.txt @@ -0,0 +1,7 @@ +note that you can also add provide all tokenizer parameters here such as padding, truncation +encoding = processor(html_string, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq']) + +Use case 2: web page classification (training, inference) + token classification (inference), parse_html=False +In case one already has obtained all nodes and xpaths, one doesn't need the feature extractor. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_32.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..31be8d44c59b24db971ade62096be04a6cbcc88f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_32.txt @@ -0,0 +1,2 @@ +In that case, one should +provide the nodes and corresponding xpaths themselves to the processor, and make sure to set parse_html to False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_33.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..653c514e37cbd9638f42f590cf8d9ca4d33b4252 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_33.txt @@ -0,0 +1,14 @@ +thon + +from transformers import MarkupLMProcessor +processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base") +processor.parse_html = False +nodes = ["hello", "world", "how", "are"] +xpaths = ["/html/body/div/li[1]/div/span", "/html/body/div/li[1]/div/span", "html/body", "html/body/div"] +encoding = processor(nodes=nodes, xpaths=xpaths, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq']) + +Use case 3: token classification (training), parse_html=False +For token classification tasks (such as SWDE), one can also provide the +corresponding node labels in order to train a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_34.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..b325201d099f869205e4a84d85dfd575677755ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_34.txt @@ -0,0 +1 @@ +The processor will then convert these into token-level labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_35.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c94310fe706ec441c24bc97cf62989c3ac32c860 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_35.txt @@ -0,0 +1,2 @@ +By default, it will only label the first wordpiece of a word, and label the remaining wordpieces with -100, which is the +ignore_index of PyTorch's CrossEntropyLoss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_36.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..0315308ef6a2b8805b52b9a972cc2987b93b845e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_36.txt @@ -0,0 +1,2 @@ +In case you want all wordpieces of a word to be labeled, you can +initialize the tokenizer with only_label_first_subword set to False. 
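A minimal sketch of that initialization, building a processor around the modified tokenizer (class and checkpoint names mirror the snippets above):

from transformers import MarkupLMFeatureExtractor, MarkupLMTokenizerFast, MarkupLMProcessor

feature_extractor = MarkupLMFeatureExtractor()
# label every wordpiece of a node instead of only the first one
tokenizer = MarkupLMTokenizerFast.from_pretrained(
    "microsoft/markuplm-base", only_label_first_subword=False
)
processor = MarkupLMProcessor(feature_extractor, tokenizer)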
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_37.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..50eb5adf851536ee7b35f106af7187f79043980d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_37.txt @@ -0,0 +1,14 @@ +thon + +from transformers import MarkupLMProcessor +processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base") +processor.parse_html = False +nodes = ["hello", "world", "how", "are"] +xpaths = ["/html/body/div/li[1]/div/span", "/html/body/div/li[1]/div/span", "html/body", "html/body/div"] +node_labels = [1, 2, 2, 1] +encoding = processor(nodes=nodes, xpaths=xpaths, node_labels=node_labels, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq', 'labels']) + +Use case 4: web page question answering (inference), parse_html=True +For question answering tasks on web pages, you can provide a question to the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_38.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcafdcd686d1d6fed43bd3a3923b5ffec9aa4ba4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_38.txt @@ -0,0 +1,2 @@ +By default, the +processor will use the feature extractor to get all nodes and xpaths, and create [CLS] question tokens [SEP] word tokens [SEP]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_39.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..d163406be2fe86ee7383fb4f24b5e867970397fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_39.txt @@ -0,0 +1,15 @@ +thon + +from transformers import MarkupLMProcessor +processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base") +html_string = """ + + + + Hello world + + + Welcome + My name is Niels. + + """ \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_40.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..c76182d1303cb5de3aaccf29a9247de2d05558fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_40.txt @@ -0,0 +1 @@ +question = "What's his name?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_41.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..6842d69b2fcacc535b49e8698709ce9c6f737f69 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_41.txt @@ -0,0 +1,6 @@ +encoding = processor(html_string, questions=question, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq']) + +Use case 5: web page question answering (inference), parse_html=False +For question answering tasks (such as WebSRC), you can provide a question to the processor. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_42.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..914a0845bb27b20486561c3a15461eb63c6bc880 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_42.txt @@ -0,0 +1,2 @@ +If you have extracted +all nodes and xpaths yourself, you can provide them directly to the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_43.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..84067d3f8a5c70c81e1ecd45df80550891a42c98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_43.txt @@ -0,0 +1 @@ +Make sure to set parse_html to False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_44.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9e6b47f96a8a31e013cdb2b57e3e9cf099d9b55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_44.txt @@ -0,0 +1,8 @@ +thon + +from transformers import MarkupLMProcessor +processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base") +processor.parse_html = False +nodes = ["hello", "world", "how", "are"] +xpaths = ["/html/body/div/li[1]/div/span", "/html/body/div/li[1]/div/span", "html/body", "html/body/div"] +question = "What's his name?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_markuplm/chunk_45.txt b/chunked/content_aware_chunking/model_doc_markuplm/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ee16555c7cd6e7574516b0b9c33ad3048b3a8e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_markuplm/chunk_45.txt @@ -0,0 +1,41 @@ +encoding = processor(nodes=nodes, xpaths=xpaths, questions=question, return_tensors="pt") +print(encoding.keys()) +dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'xpath_tags_seq', 'xpath_subs_seq']) + +Resources + +Demo notebooks +Text classification task guide +Token classification task guide +Question answering task guide + +MarkupLMConfig +[[autodoc]] MarkupLMConfig + - all +MarkupLMFeatureExtractor +[[autodoc]] MarkupLMFeatureExtractor + - call +MarkupLMTokenizer +[[autodoc]] MarkupLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +MarkupLMTokenizerFast +[[autodoc]] MarkupLMTokenizerFast + - all +MarkupLMProcessor +[[autodoc]] MarkupLMProcessor + - call +MarkupLMModel +[[autodoc]] MarkupLMModel + - forward +MarkupLMForSequenceClassification +[[autodoc]] MarkupLMForSequenceClassification + - forward +MarkupLMForTokenClassification +[[autodoc]] MarkupLMForTokenClassification + - forward +MarkupLMForQuestionAnswering +[[autodoc]] MarkupLMForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_10.txt @@ -0,0 +1 @@ +Taken from the original paper. 
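To close the loop on the question-answering use cases above, a minimal inference sketch with [MarkupLMForQuestionAnswering]; the fine-tuned checkpoint name and the argmax-based span decoding are assumptions for illustration, not something the guide above prescribes:

import torch
from transformers import MarkupLMProcessor, MarkupLMForQuestionAnswering

processor = MarkupLMProcessor.from_pretrained("microsoft/markuplm-base")
model = MarkupLMForQuestionAnswering.from_pretrained("microsoft/markuplm-base-finetuned-websrc")  # assumed checkpoint

html_string = "<html><body><p>My name is Niels.</p></body></html>"
encoding = processor(html_string, questions="What's his name?", return_tensors="pt")

with torch.no_grad():
    outputs = model(**encoding)

# take the most likely start/end positions and decode the tokens in between
start = outputs.start_logits.argmax(-1).item()
end = outputs.end_logits.argmax(-1).item()
print(processor.decode(encoding.input_ids[0, start : end + 1]))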
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab1fcc23d73de2a243b069e2d1a0b7b5ca0690a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by Shivalika Singh and Alara Dirik. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..34098471d17fc2e95a2174e098084ff708e16e23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +Mask2Former uses the same preprocessing and postprocessing steps as MaskFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7acf4f5720c09b37d459c14d83afba4aba225ec4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_14.txt @@ -0,0 +1 @@ +Use [Mask2FormerImageProcessor] or [AutoImageProcessor] to prepare images and optional targets for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b74d29fa4a65bac713c63825bee22f4a9e16599 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_15.txt @@ -0,0 +1 @@ +To get the final segmentation, depending on the task, you can call [~Mask2FormerImageProcessor.post_process_semantic_segmentation] or [~Mask2FormerImageProcessor.post_process_instance_segmentation] or [~Mask2FormerImageProcessor.post_process_panoptic_segmentation]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a916e0734cdfc6e19137796e005092e1a077f37 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_16.txt @@ -0,0 +1 @@ +All three tasks can be solved using [Mask2FormerForUniversalSegmentation] output, panoptic segmentation accepts an optional label_ids_to_fuse argument to fuse instances of the target object/s (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8748f43c1dfe000b767514d7874060aa06935397 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_17.txt @@ -0,0 +1 @@ +sky) together. 
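A minimal inference sketch tying the tips above together; the checkpoint name and image URL are assumptions, and any Mask2Former panoptic checkpoint should follow the same pattern:

import requests
import torch
from PIL import Image
from transformers import AutoImageProcessor, Mask2FormerForUniversalSegmentation

checkpoint = "facebook/mask2former-swin-large-coco-panoptic"  # assumed example checkpoint
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = Mask2FormerForUniversalSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post-process to a panoptic map at the original resolution
result = image_processor.post_process_panoptic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(result["segmentation"].shape, len(result["segments_info"]))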
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..81f35c317ec4f0fc7184f41f10701b9269aa7442 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_18.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Mask2Former. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..00579ed17ac0c853b5218b8c0df67a4ae61ae915 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_19.txt @@ -0,0 +1 @@ +Demo notebooks regarding inference + fine-tuning Mask2Former on custom data can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b997fc410fec140a8b1f0a0f029593247bf8b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_20.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ed0523b0a0cd9507cfbcda124b26fbd4f3fa263 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_22.txt @@ -0,0 +1,18 @@ +Mask2FormerConfig +[[autodoc]] Mask2FormerConfig +MaskFormer specific outputs +[[autodoc]] models.mask2former.modeling_mask2former.Mask2FormerModelOutput +[[autodoc]] models.mask2former.modeling_mask2former.Mask2FormerForUniversalSegmentationOutput +Mask2FormerModel +[[autodoc]] Mask2FormerModel + - forward +Mask2FormerForUniversalSegmentation +[[autodoc]] Mask2FormerForUniversalSegmentation + - forward +Mask2FormerImageProcessor +[[autodoc]] Mask2FormerImageProcessor + - preprocess + - encode_inputs + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..10b2b737252d134ec34792f5dd04a796e04747d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_7.txt @@ -0,0 +1 @@ +In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5173cef1edad600e8b56d71d7041825e8bf2e733 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_8.txt @@ -0,0 +1 @@ +Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mask2former/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mask2former/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..aae3117f07160af446b6fd062a9b2395f03f68e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mask2former/chunk_9.txt @@ -0,0 +1 @@ +Mask2Former architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..97891f3554addbac030f28405060c66e86f6610e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_10.txt @@ -0,0 +1 @@ +Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..73c2221f6cf01d41efff0ef336c56f04008a51f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_11.txt @@ -0,0 +1 @@ +The figure below illustrates the architecture of MaskFormer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2df294db164282f02c6ef60e2222c756d577384 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by francesco. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..be37ae7df479e1583ed28a3f7dd4f8b6eeb61dc7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +MaskFormer's Transformer decoder is identical to the decoder of DETR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b4e8794c544972a59f56bb6105a99f90d038448 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_16.txt @@ -0,0 +1 @@ +During training, the authors of DETR did find it helpful to use auxiliary losses in the decoder, especially to help the model output the correct number of objects of each class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b36f281ccd7d03a9e0ada9acc7ce72726cf0350 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_17.txt @@ -0,0 +1 @@ +If you set the parameter use_auxiliary_loss of [MaskFormerConfig] to True, then prediction feedforward neural networks and Hungarian losses are added after each decoder layer (with the FFNs sharing parameters). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0660306235ea4b1505a872d2abfe27a16fcfae1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_18.txt @@ -0,0 +1,2 @@ +If you want to train the model in a distributed environment across multiple nodes, then one should update the + get_num_masks function inside in the MaskFormerLoss class of modeling_maskformer.py. 
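Picking up the use_auxiliary_loss tip above (the distributed-training note continues just below), a minimal sketch of enabling it when building a model from a fresh configuration:

from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation

# adds per-decoder-layer prediction heads and Hungarian losses during training
config = MaskFormerConfig(use_auxiliary_loss=True)
model = MaskFormerForInstanceSegmentation(config)
print(config.use_auxiliary_loss)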
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3882d6f8d7c092f15f5ec229efefb81db3f84854 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_19.txt @@ -0,0 +1,2 @@ +When training on multiple nodes, this should be + set to the average number of target masks across all nodes, as can be seen in the original implementation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..bebf4a4d3e4793b925fcebab1bd78e16761c672e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_20.txt @@ -0,0 +1 @@ +One can use [MaskFormerImageProcessor] to prepare images for the model and optional targets for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6af65a51382b5abce49036a1d1f6d9703c12a60 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_21.txt @@ -0,0 +1 @@ +To get the final segmentation, depending on the task, you can call [~MaskFormerImageProcessor.post_process_semantic_segmentation] or [~MaskFormerImageProcessor.post_process_panoptic_segmentation]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..58e7c99d28ee8906f60dbf2535970794c4fdfc60 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_22.txt @@ -0,0 +1 @@ +Both tasks can be solved using [MaskFormerForInstanceSegmentation] output, panoptic segmentation accepts an optional label_ids_to_fuse argument to fuse instances of the target object/s (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8748f43c1dfe000b767514d7874060aa06935397 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_23.txt @@ -0,0 +1 @@ +sky) together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..a78fe141cda8ce81ab043c0e304028bf44e7a30e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_24.txt @@ -0,0 +1,3 @@ +Resources + +All notebooks that illustrate inference as well as fine-tuning on custom data with MaskFormer can be found here. 
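A minimal semantic-segmentation inference sketch along the lines described above; the checkpoint name and image URL are assumptions:

import requests
import torch
from PIL import Image
from transformers import MaskFormerImageProcessor, MaskFormerForInstanceSegmentation

checkpoint = "facebook/maskformer-swin-base-ade"  # assumed example checkpoint
image_processor = MaskFormerImageProcessor.from_pretrained(checkpoint)
model = MaskFormerForInstanceSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# a (height, width) map of predicted class ids at the original resolution
semantic_map = image_processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(semantic_map.shape)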
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_maskformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_maskformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..656da52e7f10ed2f6703e7bc31795ee344b24ba8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_maskformer/chunk_25.txt @@ -0,0 +1,25 @@ +MaskFormer specific outputs +[[autodoc]] models.maskformer.modeling_maskformer.MaskFormerModelOutput +[[autodoc]] models.maskformer.modeling_maskformer.MaskFormerForInstanceSegmentationOutput +MaskFormerConfig +[[autodoc]] MaskFormerConfig +MaskFormerImageProcessor +[[autodoc]] MaskFormerImageProcessor + - preprocess + - encode_inputs + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation +MaskFormerFeatureExtractor +[[autodoc]] MaskFormerFeatureExtractor + - call + - encode_inputs + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation +MaskFormerModel +[[autodoc]] MaskFormerModel + - forward +MaskFormerForInstanceSegmentation +[[autodoc]] MaskFormerForInstanceSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_10.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..74013ddbe2a5d735debf33895d5dae02ad7f98c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_10.txt @@ -0,0 +1 @@ +MatCha is a Visual Question Answering subset of Pix2Struct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_11.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c516abbe4966b53b1497e49454ffbf38a39575d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_11.txt @@ -0,0 +1 @@ +It renders the input question on the image and predicts the answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_12.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..03b38c1cf0f77d51be1eb3df1c6c0fb449904837 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_12.txt @@ -0,0 +1,5 @@ +Usage +Currently 6 checkpoints are available for MatCha: + +google/matcha: the base MatCha model, used to fine-tune MatCha on downstream tasks +google/matcha-chartqa: MatCha model fine-tuned on ChartQA dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_13.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..113448ffc281a902157fe886f2ec5d39b375eb38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_13.txt @@ -0,0 +1 @@ +It can be used to answer questions about charts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_14.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1eb80af616b8c01821d7f947e681f16530c15e29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_14.txt @@ -0,0 +1 @@ +google/matcha-plotqa-v1: MatCha model fine-tuned on PlotQA dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_15.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..534976cb928c4f5952869cbe5ee8871306db77bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_15.txt @@ -0,0 +1 @@ +It can be used to answer questions about plots. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_16.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..192fb7a37027b58b1ec95840a9beb61fe120682d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_16.txt @@ -0,0 +1 @@ +google/matcha-plotqa-v2: MatCha model fine-tuned on PlotQA dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_17.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..534976cb928c4f5952869cbe5ee8871306db77bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_17.txt @@ -0,0 +1 @@ +It can be used to answer questions about plots. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_18.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..75541338cb792b5c45f7ca36afd49ad897274a6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_18.txt @@ -0,0 +1 @@ +google/matcha-chart2text-statista: MatCha model fine-tuned on Statista dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_19.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..735316c52fa62fb9569687bef3b41b064670715b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_19.txt @@ -0,0 +1 @@ +google/matcha-chart2text-pew: MatCha model fine-tuned on Pew dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_20.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb2fa0e6381bf74162ee8fefc991aaf0d2bd3b7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_20.txt @@ -0,0 +1 @@ +The models finetuned on chart2text-pew and chart2text-statista are more suited for summarization, whereas the models finetuned on plotqa and chartqa are more suited for question answering. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_21.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..95d948f7c649897f670ae8444f4a24d449a17f61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_21.txt @@ -0,0 +1,10 @@ +You can use these models as follows (example on a ChatQA dataset): +thon +from transformers import AutoProcessor, Pix2StructForConditionalGeneration +import requests +from PIL import Image +model = Pix2StructForConditionalGeneration.from_pretrained("google/matcha-chartqa").to(0) +processor = AutoProcessor.from_pretrained("google/matcha-chartqa") +url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png" +image = Image.open(requests.get(url, stream=True).raw) +inputs = processor(images=image, text="Is the sum of all 4 places greater than Laos? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_22.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5344c4c77cf86fc2700bdbdd4700e319c88f440b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_22.txt @@ -0,0 +1,6 @@ +", return_tensors="pt").to(0) +predictions = model.generate(**inputs, max_new_tokens=512) +print(processor.decode(predictions[0], skip_special_tokens=True)) + +Fine-tuning +To fine-tune MatCha, refer to the pix2struct fine-tuning notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_23.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbf4d1450d4e69804f969bf73ade5b5e779aba5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_23.txt @@ -0,0 +1,7 @@ +For Pix2Struct models, we have found out that fine-tuning the model with Adafactor and cosine learning rate scheduler leads to faste convergence: +thon +from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup +optimizer = Adafactor(self.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05) +scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000) + +MatCha is a model that is trained using Pix2Struct architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_24.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f459d4af0391502ac736dcfeac7be5f782281cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_24.txt @@ -0,0 +1 @@ +You can find more information about Pix2Struct in the Pix2Struct documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_matcha/chunk_9.txt b/chunked/content_aware_chunking/model_doc_matcha/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f459d4af0391502ac736dcfeac7be5f782281cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_matcha/chunk_9.txt @@ -0,0 +1 @@ +You can find more information about Pix2Struct in the Pix2Struct documentation. 
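A minimal sketch of how the Adafactor plus cosine-schedule snippet above might be wired into an actual training step; the checkpoint, the ChartQA image and the "No" label are placeholder assumptions, and a real run would loop over a proper chart dataset (see the fine-tuning notebook mentioned above):

import requests
from PIL import Image
from transformers import AutoProcessor, Pix2StructForConditionalGeneration
from transformers.optimization import Adafactor, get_cosine_schedule_with_warmup

processor = AutoProcessor.from_pretrained("google/matcha")
model = Pix2StructForConditionalGeneration.from_pretrained("google/matcha")

optimizer = Adafactor(model.parameters(), scale_parameter=False, relative_step=False, lr=0.01, weight_decay=1e-05)
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=1000, num_training_steps=40000)

# a single illustrative example reusing the ChartQA image from the snippet above
url = "https://raw.githubusercontent.com/vis-nlp/ChartQA/main/ChartQA%20Dataset/val/png/20294671002019.png"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(images=image, text="Is the sum of all 4 places greater than Laos?", return_tensors="pt")
labels = processor.tokenizer("No", return_tensors="pt").input_ids  # placeholder target answer

loss = model(**inputs, labels=labels).loss
loss.backward()
optimizer.step()
scheduler.step()
optimizer.zero_grad()
print(float(loss))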
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5b89b1f68897799bd8aaad0c2e01e8905fad665 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_10.txt @@ -0,0 +1,2 @@ +The regular [~MBartTokenizer.__call__] will encode source text format passed as first argument or with the text +keyword, and target text format passed with the text_label keyword argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1984662de793889c1595bca76d254578e16e59cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_11.txt @@ -0,0 +1,16 @@ +Supervised training + +thon + +from transformers import MBartForConditionalGeneration, MBartTokenizer +tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO") +example_english_phrase = "UN Chief Says There Is No Military Solution in Syria" +expected_translation_romanian = "Åžeful ONU declară că nu există o soluÅ£ie militară în Siria" +inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt") +model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro") +forward pass +model(**inputs) + +Generation + +While generating the target text set the decoder_start_token_id to the target language id. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..aac3cf69b8b413115e7a7519a308e05ef97d04af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_12.txt @@ -0,0 +1,2 @@ +The following + example shows how to translate English to Romanian using the facebook/mbart-large-en-ro model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..37aff077e9d731c30cc002a10fc99b60bc87e5ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_13.txt @@ -0,0 +1,13 @@ +thon + +from transformers import MBartForConditionalGeneration, MBartTokenizer +tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX") +article = "UN Chief Says There Is No Military Solution in Syria" +inputs = tokenizer(article, return_tensors="pt") +translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"]) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +"Åžeful ONU declară că nu există o soluÅ£ie militară în Siria" + +Overview of MBart-50 +MBart-50 was introduced in the Multilingual Translation with Extensible Multilingual Pretraining and Finetuning paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav +Chaudhary, Jiatao Gu, Angela Fan. 
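Returning briefly to the mBART English-to-Romanian generation example above: it assumes a model has already been loaded. A self-contained restatement:

from transformers import MBartForConditionalGeneration, MBartTokenizer

tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")

article = "UN Chief Says There Is No Military Solution in Syria"
inputs = tokenizer(article, return_tensors="pt")
# the target language id is passed as the decoder start token
translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
print(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
# expected: "Șeful ONU declară că nu există o soluție militară în Siria"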
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..02c1ce0aa06e6835f7ce5392c09e1ff3515f7a5e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_14.txt @@ -0,0 +1,3 @@ +MBart-50 is created using the original mbart-large-cc25 checkpoint by extending +its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50 +languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbd12d56ff0b26a44718bb117aa4b1ef9473b349 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_15.txt @@ -0,0 +1,2 @@ +According to the abstract: +Multilingual translation models can be created through multilingual finetuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..46ba632b344d03fab2ab5955c951834c7f6fbc81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_16.txt @@ -0,0 +1,2 @@ +Instead of finetuning on one +direction, a pretrained model is finetuned on many directions at the same time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fe1a8db95253f2bca3fe96ca274302111cd73e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_17.txt @@ -0,0 +1,2 @@ +It demonstrates that pretrained models +can be extended to incorporate additional languages without loss of performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..26c50913489610b418c5ecb46cc4fb363e7648f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_18.txt @@ -0,0 +1,3 @@ +Multilingual finetuning improves on +average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while +improving 9.3 BLEU on average over bilingual baselines from scratch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a38ef845adfdfb5f120a7009faeb2d954e34d1bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_19.txt @@ -0,0 +1,2 @@ +Training of MBart-50 +The text format for MBart-50 is slightly different from mBART. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66c3b4b6f95a4994fb1bf324572a4f0ead0c033 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_20.txt @@ -0,0 +1,4 @@ +For MBart-50 the language id token is used as a prefix +for both source and target text, i.e. the text format is [lang_code] X [eos], where lang_code is the source +language id for source text and the target language id for target text, with X being the source or target text +respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e40678ae35128acd3af0372f81c648f201ffb540 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_21.txt @@ -0,0 +1 @@ +MBart-50 has its own tokenizer, [MBart50Tokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4f91bae4acdfb333bc32408f96f88d388b3e8a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_22.txt @@ -0,0 +1,15 @@ +Supervised training + +thon +from transformers import MBartForConditionalGeneration, MBart50TokenizerFast +model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50") +tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO") +src_text = " UN Chief Says There Is No Military Solution in Syria" +tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria" +model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt") +model(**model_inputs) # forward pass + +Generation + +To generate using the mBART-50 multilingual translation models, eos_token_id is used as the + decoder_start_token_id and the target language id is forced as the first generated token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..db7fe0032182dc1630a421cd601591b0556b7742 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_23.txt @@ -0,0 +1,2 @@ +To force the + target language id as the first generated token, pass the forced_bos_token_id parameter to the generate method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..86fa14c3ea5e67fcd52fcd64106b3c4f54a04569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_24.txt @@ -0,0 +1,2 @@ +The following example shows how to translate from Hindi to French and from Arabic to English using the + facebook/mbart-large-50-many-to-many-mmt checkpoint. 
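To make the [lang_code] X [eos] format described above concrete, here is a small sketch (not from the original docs) that inspects the tokens the tokenizer produces; it only assumes the facebook/mbart-large-50 checkpoint already used in the supervised training snippet.

from transformers import MBart50TokenizerFast

tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
enc = tokenizer("UN Chief Says There Is No Military Solution in Syria", text_target="Şeful ONU declară că nu există o soluţie militară în Siria")
# source side: roughly ['en_XX', '▁UN', '▁Chief', ..., '</s>']  ->  [lang_code] X [eos]
print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
# target side: roughly ['ro_RO', ..., '</s>']  ->  the target language code is the prefix
print(tokenizer.convert_ids_to_tokens(enc["labels"]))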
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a429b2d833b718eb83e019372509d79fa28abed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_25.txt @@ -0,0 +1,4 @@ +thon +from transformers import MBartForConditionalGeneration, MBart50TokenizerFast +article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है" +article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8aa50f94c6f5ae8eb29f4e8be2de74f242d28d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_26.txt @@ -0,0 +1,8 @@ +model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") +translate Hindi to French +tokenizer.src_lang = "hi_IN" +encoded_hi = tokenizer(article_hi, return_tensors="pt") +generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"]) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +=> "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_27.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1742e38b28dfca0b47a49c7ccc10046340fccf8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_27.txt @@ -0,0 +1,6 @@ +translate Arabic to English +tokenizer.src_lang = "ar_AR" +encoded_ar = tokenizer(article_ar, return_tensors="pt") +generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +=> "The Secretary-General of the United Nations says there is no military solution in Syria." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_28.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..417a38de67d814cb6aa4556812d89dcabbb051ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_28.txt @@ -0,0 +1,60 @@ +Documentation resources + +Text classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Translation task guide +Summarization task guide + +MBartConfig +[[autodoc]] MBartConfig +MBartTokenizer +[[autodoc]] MBartTokenizer + - build_inputs_with_special_tokens +MBartTokenizerFast +[[autodoc]] MBartTokenizerFast +MBart50Tokenizer +[[autodoc]] MBart50Tokenizer +MBart50TokenizerFast +[[autodoc]] MBart50TokenizerFast + +MBartModel +[[autodoc]] MBartModel +MBartForConditionalGeneration +[[autodoc]] MBartForConditionalGeneration +MBartForQuestionAnswering +[[autodoc]] MBartForQuestionAnswering +MBartForSequenceClassification +[[autodoc]] MBartForSequenceClassification +MBartForCausalLM +[[autodoc]] MBartForCausalLM + - forward + +TFMBartModel +[[autodoc]] TFMBartModel + - call +TFMBartForConditionalGeneration +[[autodoc]] TFMBartForConditionalGeneration + - call + +FlaxMBartModel +[[autodoc]] FlaxMBartModel + - call + - encode + - decode +FlaxMBartForConditionalGeneration +[[autodoc]] FlaxMBartForConditionalGeneration + - call + - encode + - decode +FlaxMBartForSequenceClassification +[[autodoc]] FlaxMBartForSequenceClassification + - call + - encode + - decode +FlaxMBartForQuestionAnswering +[[autodoc]] FlaxMBartForQuestionAnswering + - call + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mbart/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mbart/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a35ed805ce3cd7e5960410873ecb9896dd4eb06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mbart/chunk_9.txt @@ -0,0 +1 @@ +bos is never used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7ceaeb493cc9730352b292392b9c80c3e2f784e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_10.txt @@ -0,0 +1,2 @@ +In this work, we extend pseudo-labeling to massively multilingual speech +recognition with 60 languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f37c78f624bc47d7007c2ccb99ee2a576940247 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_11.txt @@ -0,0 +1,4 @@ +We propose a simple pseudo-labeling recipe that works well even +with low-resource languages: train a supervised multilingual model, fine-tune it with semi-supervised +learning on a target language, generate pseudo-labels for that language, and train a final model using +pseudo-labels for all languages, either from scratch or by fine-tuning. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7e5be1d88d76dc21c07d8d8629353459e9b673 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_12.txt @@ -0,0 +1,3 @@ +Experiments on the labeled +Common Voice and unlabeled VoxPopuli datasets show that our recipe can yield a model with better +performance for many languages that also transfers well to LibriSpeech. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9616dd5a98c0412f2c71174d4cd270c7889d5810 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by cwkeam. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e06ab0a22a64a2f09cfa5b0c572dea76799ad6b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_15.txt @@ -0,0 +1,2 @@ +Usage tips +The PyTorch version of this model is only available in torch 1.9 and higher. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd73f832e29930c75613af7687194573a71f380f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_16.txt @@ -0,0 +1,22 @@ +Resources + +Automatic speech recognition task guide + +MCTCTConfig +[[autodoc]] MCTCTConfig +MCTCTFeatureExtractor +[[autodoc]] MCTCTFeatureExtractor + - call +MCTCTProcessor +[[autodoc]] MCTCTProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode +MCTCTModel +[[autodoc]] MCTCTModel + - forward +MCTCTForCTC +[[autodoc]] MCTCTForCTC + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c34bfa308e157b5c443f02b5aa3536341640af7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_6.txt @@ -0,0 +1 @@ +After training on Common Voice and VoxPopuli, the model is trained on Common Voice only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e30e2e1fcf796aa0d291b89fb12c43bc8114fa0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_7.txt @@ -0,0 +1 @@ +The labels are unnormalized character-level transcripts (punctuation and capitalization are not removed). 
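A minimal inference sketch tying together the classes listed above; the speechbrain/m-ctc-t-large checkpoint and the random array standing in for one second of real 16kHz mono audio are assumptions made for illustration.

import numpy as np
import torch
from transformers import MCTCTProcessor, MCTCTForCTC

processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")
model = MCTCTForCTC.from_pretrained("speechbrain/m-ctc-t-large")

waveform = np.random.randn(16000).astype(np.float32)  # placeholder for real 16kHz audio
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")  # extracts Mel filterbank features
with torch.no_grad():
    logits = model(**inputs).logits
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)  # unnormalized character-level transcript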
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b43a7f67496d10dd4f6137bd2e52129e6d63a232 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_8.txt @@ -0,0 +1 @@ +The model takes as input Mel filterbank features from a 16Khz audio signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mctct/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mctct/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1634bb08cbbbf74d409cab6caf8c4a3429fb30e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mctct/chunk_9.txt @@ -0,0 +1,3 @@ +The abstract from the paper is the following: +Semi-supervised learning through pseudo-labeling has become a staple of state-of-the-art monolingual +speech recognition systems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..9144292927649323e8cb7b91abd247224f746c9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +MEGA can perform quite well with relatively few parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dbf20ab565d53c51e0a51adb606717b7b89a847 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_11.txt @@ -0,0 +1 @@ +See Appendix D in the MEGA paper for examples of architectural specs which perform well in various settings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5f543eead4a7d9942917b964d7ce4c6e0d04b7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_12.txt @@ -0,0 +1 @@ +If using MEGA as a decoder, be sure to set bidirectional=False to avoid errors with default bidirectional. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..36887956725328bac473e185ba0dca2a925a1ae4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_13.txt @@ -0,0 +1 @@ +Mega-chunk is a variant of mega that reduces time and spaces complexity from quadratic to linear. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7293872785b5e9d3d362ad11833ae55f4681ed1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_14.txt @@ -0,0 +1,5 @@ +Utilize chunking with MegaConfig.use_chunking and control chunk size with MegaConfig.chunk_size + +Implementation Notes + +The original implementation of MEGA had an inconsistent expectation of attention masks for padding and causal self-attention between the softmax attention and Laplace/squared ReLU method. 
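Returning to the usage tips above (decoder use and Mega-chunk), here is a small configuration sketch; the vocabulary size, hidden size and chunk size are illustrative values, not recommendations from the paper.

from transformers import MegaConfig, MegaForCausalLM

config = MegaConfig(
    vocab_size=32000,      # illustrative
    hidden_size=512,       # illustrative
    is_decoder=True,
    bidirectional=False,   # required when using MEGA as a decoder
    use_chunking=True,     # enable Mega-chunk: linear rather than quadratic complexity
    chunk_size=128,        # attention is computed within chunks of 128 tokens
)
model = MegaForCausalLM(config)  # randomly initialized, for demonstration only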
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9caec3ba371d84fee2e15515f38235f175490432 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_15.txt @@ -0,0 +1 @@ +This implementation addresses that inconsistency. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb2802c1ce8d0560ecdca6dac154b67533177543 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_16.txt @@ -0,0 +1,25 @@ +The original implementation did not include token type embeddings; this implementation adds support for these, with the option controlled by MegaConfig.add_token_type_embeddings + +MegaConfig +[[autodoc]] MegaConfig +MegaModel +[[autodoc]] MegaModel + - forward +MegaForCausalLM +[[autodoc]] MegaForCausalLM + - forward +MegaForMaskedLM +[[autodoc]] MegaForMaskedLM + - forward +MegaForSequenceClassification +[[autodoc]] MegaForSequenceClassification + - forward +MegaForMultipleChoice +[[autodoc]] MegaForMultipleChoice + - forward +MegaForTokenClassification +[[autodoc]] MegaForTokenClassification + - forward +MegaForQuestionAnswering +[[autodoc]] MegaForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4a8f81cd940758106db066fa2939921dda79237 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_7.txt @@ -0,0 +1 @@ +Extensive experiments on a wide range of sequence modeling benchmarks, including the Long Range Arena, neural machine translation, auto-regressive language modeling, and image and speech classification, show that Mega achieves significant improvements over other sequence models, including variants of Transformers and recent state space models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac1995e982351486922e614bbb64c1c604120fc7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_8.txt @@ -0,0 +1,2 @@ +* +This model was contributed by mnaylor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mega/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mega/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mega/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb56912f9ecb6b2ac0ae598ab81477485d1fb3b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_10.txt @@ -0,0 +1,2 @@ +Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..524d6c512e67a8c093b9dbdfe91de0b3ba06126e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by jdemouth. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b5b0d2adaf51cb7f0b2ab9d84386d145377706a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_13.txt @@ -0,0 +1 @@ +That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2b8dd6aee20c3e309366d2a1ecd9aa1d25ec04d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_14.txt @@ -0,0 +1,2 @@ +In particular, +it contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..521a4c0d491a2842419e4f6aeb4d8f29def8caa9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips +We have provided pretrained BERT-345M checkpoints +for use to evaluate or finetuning downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..77711555ca8625106ab65d7a3248dc650146177f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_16.txt @@ -0,0 +1,2 @@ +To access these checkpoints, first sign up for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b522fc923578fefa1696825b3db2d010d7ab20 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_17.txt @@ -0,0 +1 @@ +Further documentation for downloading models can be found in the NGC documentation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..3da0cfc506c2c342715b2f2264b9d9326664642b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_18.txt @@ -0,0 +1,11 @@ +Alternatively, you can directly download the checkpoints using: +BERT-345M-uncased: + +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_uncased/zip +-O megatron_bert_345m_v0_1_uncased.zip +BERT-345M-cased: + +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_bert_345m/versions/v0.1_cased/zip -O +megatron_bert_345m_v0_1_cased.zip +Once you have obtained the checkpoints from NVIDIA GPU Cloud (NGC), you have to convert them to a format that will +easily be loaded by Hugging Face Transformers and our port of the BERT code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..865d5949f4761f70a95c8c89801390bb1fedd90e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_19.txt @@ -0,0 +1 @@ +The following commands allow you to do the conversion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a543f65f664382ac8da91845bd372843bda977a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_20.txt @@ -0,0 +1,44 @@ +We assume that the folder models/megatron_bert contains +megatron_bert_345m_v0_1_{cased, uncased}.zip and that the commands are run from inside that folder: + +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_uncased.zip + +python3 $PATH_TO_TRANSFORMERS/models/megatron_bert/convert_megatron_bert_checkpoint.py megatron_bert_345m_v0_1_cased.zip +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +MegatronBertConfig +[[autodoc]] MegatronBertConfig +MegatronBertModel +[[autodoc]] MegatronBertModel + - forward +MegatronBertForMaskedLM +[[autodoc]] MegatronBertForMaskedLM + - forward +MegatronBertForCausalLM +[[autodoc]] MegatronBertForCausalLM + - forward +MegatronBertForNextSentencePrediction +[[autodoc]] MegatronBertForNextSentencePrediction + - forward +MegatronBertForPreTraining +[[autodoc]] MegatronBertForPreTraining + - forward +MegatronBertForSequenceClassification +[[autodoc]] MegatronBertForSequenceClassification + - forward +MegatronBertForMultipleChoice +[[autodoc]] MegatronBertForMultipleChoice + - forward +MegatronBertForTokenClassification +[[autodoc]] MegatronBertForTokenClassification + - forward +MegatronBertForQuestionAnswering +[[autodoc]] MegatronBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_8.txt new file mode 100644 index 
0000000000000000000000000000000000000000..84ec4e6cd66e8ae9adfbd4d649a950b8e8b205f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_8.txt @@ -0,0 +1,2 @@ +We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..960f0f60d76be40b6fa003a530d77f6c9ad0e483 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron-bert/chunk_9.txt @@ -0,0 +1,3 @@ +Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb56912f9ecb6b2ac0ae598ab81477485d1fb3b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_10.txt @@ -0,0 +1,2 @@ +Our BERT model achieves SOTA results on the RACE dataset (90.9% compared to SOTA accuracy +of 89.4%). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..524d6c512e67a8c093b9dbdfe91de0b3ba06126e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by jdemouth. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b5b0d2adaf51cb7f0b2ab9d84386d145377706a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_13.txt @@ -0,0 +1 @@ +That repository contains a multi-GPU and multi-node implementation of the Megatron Language models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ea36ec7884a44c7dc3c8b0d88d37aa5ccb808d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_14.txt @@ -0,0 +1,2 @@ +In particular, it +contains a hybrid model parallel approach using "tensor parallel" and "pipeline parallel" techniques. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..11de990bcbedc67e471097fb384d950c92edd2b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips +We have provided pretrained GPT2-345M checkpoints +for use to evaluate or finetuning downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..77711555ca8625106ab65d7a3248dc650146177f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_16.txt @@ -0,0 +1,2 @@ +To access these checkpoints, first sign up for and setup the NVIDIA GPU Cloud (NGC) +Registry CLI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b522fc923578fefa1696825b3db2d010d7ab20 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_17.txt @@ -0,0 +1 @@ +Further documentation for downloading models can be found in the NGC documentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d46be87d1e9f89051819c83eb328a0389ef3c448 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_18.txt @@ -0,0 +1,6 @@ +Alternatively, you can directly download the checkpoints using: + +wget --content-disposition https://api.ngc.nvidia.com/v2/models/nvidia/megatron_lm_345m/versions/v0.0/zip -O +megatron_gpt2_345m_v0_0.zip +Once you have obtained the checkpoint from NVIDIA GPU Cloud (NGC), you have to convert it to a format that will easily +be loaded by Hugging Face Transformers GPT2 implementation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..319e75d8b8a80daea86e232a2e12012c24f742cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_19.txt @@ -0,0 +1 @@ +The following command allows you to do the conversion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..25f5f9198c45d2dc345cb2812fb33b2d05d68e4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_20.txt @@ -0,0 +1,6 @@ +We assume that the folder models/megatron_gpt2 contains +megatron_gpt2_345m_v0_0.zip and that the command is run from that folder: + +python3 $PATH_TO_TRANSFORMERS/models/megatron_gpt2/convert_megatron_gpt2_checkpoint.py megatron_gpt2_345m_v0_0.zip + +MegatronGPT2 architecture is the same as OpenAI GPT-2 . 
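Since the converted checkpoint is architecturally a standard GPT-2 model, it can be loaded with the usual GPT-2 classes; the local output path below is hypothetical, and reusing the stock gpt2 tokenizer files is an assumption rather than something stated in the original docs.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

checkpoint_dir = "models/megatron_gpt2"  # hypothetical folder containing the converted config and weights
model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")  # assumed: the converted model reuses the standard GPT-2 vocabulary

inputs = tokenizer("The conversion script produced a checkpoint that", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))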
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f06a67278f90a524037b3ecc4c48a3f3053e20 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_21.txt @@ -0,0 +1,2 @@ +Refer to GPT-2 documentation for information on + configuration classes and their parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..84ec4e6cd66e8ae9adfbd4d649a950b8e8b205f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_8.txt @@ -0,0 +1,2 @@ +We show that careful attention to the placement of layer normalization in +BERT-like models is critical to achieving increased performance as the model size grows. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..960f0f60d76be40b6fa003a530d77f6c9ad0e483 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_megatron_gpt2/chunk_9.txt @@ -0,0 +1,3 @@ +Using the GPT-2 model we +achieve SOTA results on the WikiText103 (10.8 compared to SOTA perplexity of 15.8) and LAMBADA (66.5% compared to SOTA +accuracy of 63.2%) datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dfb1a19416f4675438bb3efe31cee982cfd93f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_10.txt @@ -0,0 +1 @@ +MGP-STR architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d5cb8f7e6d2424009b3cb81e72c3454ad5feaf2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_12.txt @@ -0,0 +1 @@ +MGP-STR is trained on two synthetic datasets MJSynth (MJ) and SynthText (ST) without fine-tuning on other datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..17c4dc748254b4c67db1e662fa5b5eb62e8254af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_13.txt @@ -0,0 +1 @@ +It achieves state-of-the-art results on six standard Latin scene text benchmarks, including 3 regular text datasets (IC13, SVT, IIIT) and 3 irregular ones (IC15, SVTP, CUTE). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d91651ea78e757fc2818f82ba900b315d28db4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_14.txt @@ -0,0 +1 @@ +This model was contributed by yuekun. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..23f2261f01e863c943e8675025163737c58ce44e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_16.txt @@ -0,0 +1,2 @@ +Inference example +[MgpstrModel] accepts images as input and generates three types of predictions, which represent textual information at different granularities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..9743767d4b3b77e66443fcda57bfc08afad609c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_17.txt @@ -0,0 +1 @@ +The three types of predictions are fused to give the final prediction result. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d73da200af311c0d442b3fa50c5551a8d033c887 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_18.txt @@ -0,0 +1,2 @@ +The [ViTImageProcessor] class is responsible for preprocessing the input image and +[MgpstrTokenizer] decodes the generated character tokens to the target string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ed7bf4737ac8a06a27259aba1e029ab175aae6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_19.txt @@ -0,0 +1,3 @@ +The +[MgpstrProcessor] wraps [ViTImageProcessor] and [MgpstrTokenizer] +into a single instance to both extract the input features and decode the predicted token ids. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..19cc5314d6e5b8c9ab602cf3eacfe7e5ae0622e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_20.txt @@ -0,0 +1,29 @@ +Step-by-step Optical Character Recognition (OCR) + +from transformers import MgpstrProcessor, MgpstrForSceneTextRecognition +import requests +from PIL import Image +processor = MgpstrProcessor.from_pretrained('alibaba-damo/mgp-str-base') +model = MgpstrForSceneTextRecognition.from_pretrained('alibaba-damo/mgp-str-base') +load image from the IIIT-5k dataset +url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png" +image = Image.open(requests.get(url, stream=True).raw).convert("RGB") +pixel_values = processor(images=image, return_tensors="pt").pixel_values +outputs = model(pixel_values) +generated_text = processor.batch_decode(outputs.logits)['generated_text'] + +MgpstrConfig +[[autodoc]] MgpstrConfig +MgpstrTokenizer +[[autodoc]] MgpstrTokenizer + - save_vocabulary +MgpstrProcessor +[[autodoc]] MgpstrProcessor + - call + - batch_decode +MgpstrModel +[[autodoc]] MgpstrModel + - forward +MgpstrForSceneTextRecognition +[[autodoc]] MgpstrForSceneTextRecognition + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89066af868a4ae47976f973adb308314cb9cc0f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_7.txt @@ -0,0 +1 @@ +, subword representations (BPE and WordPiece) widely-used in NLP are introduced into the output space, in addition to the conventional character level representation, while no independent language model (LM) is adopted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..28914be2099ca0958ed0756492e34929dfb5611d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_8.txt @@ -0,0 +1 @@ +The resultant algorithm (termed MGP-STR) is able to push the performance envelop of STR to an even higher level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mgp-str/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b75083aa361700136e36c6e57887e229427778e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mgp-str/chunk_9.txt @@ -0,0 +1 @@ +Specifically, it achieves an average recognition accuracy of 93.35% on standard benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..564f6f90b816d7b6ede21c61aa918c7062e8a680 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_10.txt @@ -0,0 +1,2 @@ +Sliding window Attention +The current implementation supports the sliding window attention mechanism and memory efficient cache management. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfa21c378d3097b6dcf45077744c4672f2998d12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_11.txt @@ -0,0 +1 @@ +To enable sliding window attention, just make sure to have a flash-attn version that is compatible with sliding window attention (>=2.3.0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..88bdc1d028ee614c83edcb15af1d3c955d03999d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_12.txt @@ -0,0 +1 @@ +The Flash Attention-2 model uses also a more memory efficient cache slicing mechanism - as recommended per the official implementation of Mistral model that use rolling cache mechanism we keep the cache size fixed (self.config.sliding_window), support batched generation only for padding_side="left" and use the absolute position of the current token to compute the positional embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..17fcc8699a8bd5591605b5fd122e1b723f2033c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_13.txt @@ -0,0 +1,2 @@ +The Mistral Team +Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..395b3fc533b7326a4b565199b925a62dc8605a7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_14.txt @@ -0,0 +1,17 @@ +MistralConfig +[[autodoc]] MistralConfig +MistralModel +[[autodoc]] MistralModel + - forward +MistralForCausalLM +[[autodoc]] MistralForCausalLM + - forward +MistralForSequenceClassification +[[autodoc]] MistralForSequenceClassification + - forward +FlaxMistralModel +[[autodoc]] FlaxMistralModel + - call +FlaxMistralForCausalLM +[[autodoc]] FlaxMistralForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_6.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Make also sure that you have a hardware that is compatible with Flash-Attention 2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_7.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of the flash-attn repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_8.txt @@ -0,0 +1 @@ +Also make sure to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mistral/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mistral/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..330217582893febfa5e710c3d02615eeeb601dbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mistral/chunk_9.txt @@ -0,0 +1,18 @@ +torch.float16) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1") +prompt = "My favourite condiment is" +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +tokenizer.batch_decode(generated_ids)[0] +"The expected output" + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the mistralai/Mistral-7B-v0.1 checkpoint and the Flash Attention 2 version of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6835832a283f2d8322e3a66227ee845807f165 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_11.txt @@ -0,0 +1,4 @@ +Model Details +Mixtral-45B is a decoder-based LM with the following architectural choices: + +Mixtral is a Mixture of Experts (MoE) model with 8 experts per MLP, with a total of 45B parameters, but the compute required is the same as for a 14B model. 
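To make the parameter arithmetic behind this claim concrete, here is a back-of-the-envelope sketch with purely illustrative numbers (not the real Mixtral shapes): only two of the eight expert MLPs run for each token, while the attention layers and embeddings are always active.

# purely illustrative parameter counts, not the actual Mixtral-8x7B architecture
shared = 2.0e9                            # attention, embeddings, norms: used for every token
per_expert = 5.4e9                        # one expert MLP
total_params = shared + 8 * per_expert    # ~45B: every expert must sit in memory
active_params = shared + 2 * per_expert   # ~13B: top-2 routing runs only two experts per token
print(f"total ~ {total_params / 1e9:.0f}B, active per token ~ {active_params / 1e9:.0f}B")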
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e7e441732019cba23487c0d505c5b6fd5a9b823 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_12.txt @@ -0,0 +1 @@ +This is because even though each experts have to be loaded in RAM (70B like ram requirement) each token from the hidden states are dispatched twice (top 2 routing) and thus the compute (the operation required at each forward computation) is just 2 X sequence_length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6dcf7fa19c0117864d21dca18f9ceb31e2f901a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_13.txt @@ -0,0 +1,3 @@ +The following implementation details are shared with Mistral AI's first model mistral: +* Sliding Window Attention - Trained with 8k context length and fixed cache size, with a theoretical attention span of 128K tokens +* GQA (Grouped Query Attention) - allowing faster inference and lower cache size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca17197b2831ededae5c5cce47ef39f86e169407 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_14.txt @@ -0,0 +1 @@ +* Byte-fallback BPE tokenizer - ensures that characters are never mapped to out of vocabulary tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f90d1828cd6d7d532251a6a1a7e9847ae1be09bc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_15.txt @@ -0,0 +1 @@ +They also provide an instruction fine-tuned model: mistralai/Mixtral-8x7B-v0.1 which can be used for chat-based inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..55f46d4d40c6f5a340bae8f5cec183f7345a9977 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_16.txt @@ -0,0 +1,3 @@ +For more details please read our release blog post +License +Mixtral-8x7B is released under the Apache 2.0 license. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..df1dd9e673b9c933af2ffce1dbdaa04baea71b2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_17.txt @@ -0,0 +1,28 @@ +Usage tips +Mixtral-8x7B can be found on the Huggingface Hub +These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hub: +thon + +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1") +prompt = "My favourite condiment is" +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +tokenizer.batch_decode(generated_ids)[0] +"The expected output" + +To use the raw checkpoints with HuggingFace you can use the convert_mixtral_weights_to_hf.py script to convert them to the HuggingFace format: + +python src/transformers/models/mixtral/convert_mixtral_weights_to_hf.py \ + --input_dir /path/to/downloaded/mistral/weights --output_dir /output/path +You can then load the converted model from the output/path: +thon +from transformers import MixtralForCausalLM, LlamaTokenizer +tokenizer = LlamaTokenizer.from_pretrained("/output/path") +model = MixtralForCausalLM.from_pretrained("/output/path") + +Combining Mixtral and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_18.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Make also sure that you have a hardware that is compatible with Flash-Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_19.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of flash-attn repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_20.txt @@ -0,0 +1 @@ +Make also sure to load your model in half-precision (e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..85c8b64025c4ea1e4407752cb80891c5afaee5f2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_21.txt @@ -0,0 +1,18 @@ +torch.float16) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-v0.1", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-v0.1") +prompt = "My favourite condiment is" +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True) +tokenizer.batch_decode(generated_ids)[0] +"The expected output" + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the mistralai/Mixtral-8x7B-v0.1 checkpoint and the Flash Attention 2 version of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..564f6f90b816d7b6ede21c61aa918c7062e8a680 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_22.txt @@ -0,0 +1,2 @@ +Sliding window Attention +The current implementation supports the sliding window attention mechanism and memory-efficient cache management. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfa21c378d3097b6dcf45077744c4672f2998d12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_23.txt @@ -0,0 +1 @@ +To enable sliding window attention, just make sure to have a flash-attn version that is compatible with sliding window attention (>=2.3.0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..88bdc1d028ee614c83edcb15af1d3c955d03999d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_24.txt @@ -0,0 +1 @@ +The Flash Attention-2 model also uses a more memory-efficient cache slicing mechanism - as recommended in the official implementation of the Mistral model, which uses a rolling cache, we keep the cache size fixed (self.config.sliding_window), support batched generation only for padding_side="left", and use the absolute position of the current token to compute the positional embedding. 
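As a rough, hedged illustration of the rolling-buffer idea described in the chunk above (not the actual cache classes used by Transformers), a fixed-size cache can be indexed by the absolute token position modulo the sliding window; the window size and the toy loop below are assumptions for the sketch:

```python
# Toy sketch of a rolling key/value cache slot index with a fixed sliding window.
# Illustrative only; it does not mirror the real Mixtral cache implementation.
sliding_window = 4096  # assumed window size for the example

def cache_slot(absolute_position: int) -> int:
    # The cache never grows beyond `sliding_window` entries; once the window is
    # full, new positions overwrite the oldest slots.
    return absolute_position % sliding_window

for pos in (0, 1, 4095, 4096, 4097):
    print(pos, "->", cache_slot(pos))
```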
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..17fcc8699a8bd5591605b5fd122e1b723f2033c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_25.txt @@ -0,0 +1,2 @@ +The Mistral Team +Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..f71f9e0eba6c00ef752e82e36d760493f77f45bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_26.txt @@ -0,0 +1,11 @@ +MixtralConfig +[[autodoc]] MixtralConfig +MixtralModel +[[autodoc]] MixtralModel + - forward +MixtralForCausalLM +[[autodoc]] MixtralForCausalLM + - forward +MixtralForSequenceClassification +[[autodoc]] MixtralForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mixtral/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mixtral/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..11bce26a4120d3610adea20c4b0f83237bc5fe1d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mixtral/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by Younes Belkada and Arthur Zucker . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..97a406bf307c1e7b0c8ba866f8cbbef7601f2fd6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by ryo0634. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..43eb2e5eb56ba474a0b9ec0bf8f51a4fbe0374a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_12.txt @@ -0,0 +1,7 @@ +Usage tips +One can directly plug in the weights of mLUKE into a LUKE model, like so: +thon +from transformers import LukeModel +model = LukeModel.from_pretrained("studio-ousia/mluke-base") + +Note that mLUKE has its own tokenizer, [MLukeTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cb7887cfc7657dd5663c2f0457bf33cad25c364 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_13.txt @@ -0,0 +1,7 @@ +You can initialize it as follows: +thon +from transformers import MLukeTokenizer +tokenizer = MLukeTokenizer.from_pretrained("studio-ousia/mluke-base") + +As mLUKE's architecture is equivalent to that of LUKE, one can refer to LUKE's documentation page for all +tips, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf2382b62a22187698579d0593bb821d1d0d9cdf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_14.txt @@ -0,0 +1,4 @@ +MLukeTokenizer +[[autodoc]] MLukeTokenizer + - call + - save_vocabulary \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..06b287019faa009af347807c395f70e7789327c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_6.txt @@ -0,0 +1,3 @@ +We train a multilingual language model with 24 languages +with entity representations and show the model consistently outperforms word-based pretrained models in various +cross-lingual transfer tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e8366e680ebeca87c528ebf18b866dff830ff7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_7.txt @@ -0,0 +1,2 @@ +We also analyze the model and the key insight is that incorporating entity +representations into the input allows us to extract more language-agnostic features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b60af6663496da85953b5630515a6c91200c7eb6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_8.txt @@ -0,0 +1,2 @@ +We also evaluate the model with a +multilingual cloze prompt task with the mLAMA dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mluke/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mluke/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..20e3d4080e00c6358254f30b98492c78ec2bd16e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mluke/chunk_9.txt @@ -0,0 +1,2 @@ +We show that entity-based prompt elicits correct factual +knowledge more likely than using only word representations. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..af19e84d5592e573e40a56385cd4f45ae5f10956 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_18.txt @@ -0,0 +1,2 @@ +The ignore_mismatched_sizes=True keyword has to be passed to allow the language model head to be resized according +to the vocabulary of the specified language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1207cb016a27a6aaf795c9284de1f52dafb3cdf7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_19.txt @@ -0,0 +1,14 @@ +Similarly, the processor should be loaded with the same target language + +from transformers import Wav2Vec2ForCTC, AutoProcessor +model_id = "facebook/mms-1b-all" +target_lang = "fra" +processor = AutoProcessor.from_pretrained(model_id, target_lang=target_lang) +model = Wav2Vec2ForCTC.from_pretrained(model_id, target_lang=target_lang, ignore_mismatched_sizes=True) + +You can safely ignore a warning such as: +text +Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/mms-1b-all and are newly initialized because the shapes did not match: +- lm_head.bias: found shape torch.Size([154]) in the checkpoint and torch.Size([314]) in the model instantiated +- lm_head.weight: found shape torch.Size([154, 1280]) in the checkpoint and torch.Size([314, 1280]) in the model instantiated +You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b73c67b6289d15c5c5748f0acd421aae64a9598 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_20.txt @@ -0,0 +1,10 @@ +If you want to use the ASR pipeline, you can load your chosen target language as such: + +from transformers import pipeline +model_id = "facebook/mms-1b-all" +target_lang = "fra" +pipe = pipeline(model=model_id, model_kwargs={"target_lang": "fra", "ignore_mismatched_sizes": True}) + +Inference +Next, let's look at how we can run MMS in inference and change adapter layers after having called [~PretrainedModel.from_pretrained] +First, we load audio data in different languages using the Datasets. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e32de02aaef4718856578b431af00026e8e2428d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_21.txt @@ -0,0 +1,20 @@ +from datasets import load_dataset, Audio +English +stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="test", streaming=True) +stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) +en_sample = next(iter(stream_data))["audio"]["array"] +French +stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "fr", split="test", streaming=True) +stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) +fr_sample = next(iter(stream_data))["audio"]["array"] + +Next, we load the model and processor + +from transformers import Wav2Vec2ForCTC, AutoProcessor +import torch +model_id = "facebook/mms-1b-all" +processor = AutoProcessor.from_pretrained(model_id) +model = Wav2Vec2ForCTC.from_pretrained(model_id) + +Now we process the audio data, pass the processed audio data to the model and transcribe the model output, +just like we usually do for [Wav2Vec2ForCTC]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7fe1b343684840d3aceca8822da322de318066f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_22.txt @@ -0,0 +1,9 @@ +inputs = processor(en_sample, sampling_rate=16_000, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs).logits +ids = torch.argmax(outputs, dim=-1)[0] +transcription = processor.decode(ids) +'joe keton disapproved of films and buster also had reservations about the media' + +We can now keep the same model in memory and simply switch out the language adapters by +calling the convenient [~Wav2Vec2ForCTC.load_adapter] function for the model and [~Wav2Vec2CTCTokenizer.set_target_lang] for the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..35b0cd911b51dfbf2dffc6d5ea860a03e2ed83fe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_23.txt @@ -0,0 +1 @@ +We pass the target language as an input - "fra" for French. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0ce4c3870b0cf877a0fbbccfdd7a4fbd523b3ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_24.txt @@ -0,0 +1,10 @@ +processor.tokenizer.set_target_lang("fra") +model.load_adapter("fra") +inputs = processor(fr_sample, sampling_rate=16_000, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs).logits +ids = torch.argmax(outputs, dim=-1)[0] +transcription = processor.decode(ids) +"ce dernier est volé tout au long de l'histoire romaine" + +In the same way the language can be switched out for all other supported languages. 
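As a hedged example of that switch, the same pattern applied to a third language; the ISO code "spa" for Spanish and the `es_sample` waveform are assumptions, not part of the original guide:

```python
# Assumes a 16 kHz mono waveform `es_sample` prepared as in the snippets above;
# "spa" is assumed to be the ISO 639-3 code used by the MMS checkpoints for Spanish.
import torch
from transformers import Wav2Vec2ForCTC, AutoProcessor

model_id = "facebook/mms-1b-all"
processor = AutoProcessor.from_pretrained(model_id)
model = Wav2Vec2ForCTC.from_pretrained(model_id)

processor.tokenizer.set_target_lang("spa")
model.load_adapter("spa")

inputs = processor(es_sample, sampling_rate=16_000, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
ids = torch.argmax(logits, dim=-1)[0]
print(processor.decode(ids))
```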
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cee48d0eb656c4e18619066f8029cf9d5c085af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_25.txt @@ -0,0 +1,4 @@ +Please have a look at: +py +processor.tokenizer.vocab.keys() +to see all supported languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb45f6a0d2181fb22f1ca1a422c7636b8bf4916a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_26.txt @@ -0,0 +1 @@ +To further improve performance from ASR models, language model decoding can be used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_27.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..4702bcf4aa8d9ee103d4bc477bacb6043cc28826 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_27.txt @@ -0,0 +1 @@ +See the documentation here for further details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_28.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..78bf4a58c6333b28a55e69c047be178ddf8066cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_28.txt @@ -0,0 +1,2 @@ +Speech Synthesis (TTS) +MMS-TTS uses the same model architecture as VITS, which was added to 🤗 Transformers in v4.33. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_29.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aef3fc83e59efdae3044cac172b7b4e17aee560 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_29.txt @@ -0,0 +1,2 @@ +MMS trains a separate +model checkpoint for each of the 1100+ languages in the project. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_30.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2bbf6b1457435735b66e6a0baf789ccedbbdeb1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_30.txt @@ -0,0 +1,3 @@ +All available checkpoints can be found on the Hugging +Face Hub: facebook/mms-tts, and the inference +documentation under VITS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_31.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..799508a5735b3bb5f12a27fd89e003524d003c1a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_31.txt @@ -0,0 +1,6 @@ +Inference +To use the MMS model, first update to the latest version of the Transformers library: + +pip install --upgrade transformers accelerate +Since the flow-based model in VITS is non-deterministic, it is good practice to set a seed to ensure reproducibility of +the outputs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_32.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bf37ec6b87d85b8b6308b723753765f2fae5ae0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_32.txt @@ -0,0 +1,2 @@ +For languages with a Roman alphabet, such as English or French, the tokenizer can be used directly to +pre-process the text inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_33.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b66ac241c6fa52094230af64cb071c44d8c79e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_33.txt @@ -0,0 +1,25 @@ +The following code example runs a forward pass using the MMS-TTS English checkpoint: + +thon +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +model = VitsModel.from_pretrained("facebook/mms-tts-eng") +inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(**inputs) +waveform = outputs.waveform[0] + +The resulting waveform can be saved as a .wav file: +thon +import scipy +scipy.io.wavfile.write("synthesized_speech.wav", rate=model.config.sampling_rate, data=waveform) + +Or displayed in a Jupyter Notebook / Google Colab: +thon +from IPython.display import Audio +Audio(waveform, rate=model.config.sampling_rate) + +For certain languages with non-Roman alphabets, such as Arabic, Mandarin or Hindi, the uroman +perl package is required to pre-process the text inputs to the Roman alphabet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_34.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cb34d4e1a9d3e1a605bf2d904d5886c7120fe62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_34.txt @@ -0,0 +1,9 @@ +You can check whether you require the uroman package for your language by inspecting the is_uroman attribute of +the pre-trained tokenizer: +thon +from transformers import VitsTokenizer +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +print(tokenizer.is_uroman) + +If required, you should apply the uroman package to your text inputs prior to passing them to the VitsTokenizer, +since currently the tokenizer does not support performing the pre-processing itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_35.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e4f6c6ba86ae21a00410dcc98081e44d4eaa39e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_35.txt @@ -0,0 +1,6 @@ +To do this, first clone the uroman repository to your local machine and set the bash variable UROMAN to the local path: + +git clone https://github.com/isi-nlp/uroman.git +cd uroman +export UROMAN=$(pwd) +You can then pre-process the text input using the following code snippet. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_36.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ee47f99541641824e79977207342b156f7d8d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_36.txt @@ -0,0 +1,11 @@ +You can either rely on using the bash variable +UROMAN to point to the uroman repository, or you can pass the uroman directory as an argument to the uromanize function: +thon +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +import os +import subprocess +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor") +model = VitsModel.from_pretrained("facebook/mms-tts-kor") +def uromanize(input_string, uroman_path): +    """Convert non-Roman strings to Roman using the uroman perl package.""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_37.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e89d7a2ec684ba16e7b11bb67ad66e5d915621 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_37.txt @@ -0,0 +1,24 @@ +script_path = os.path.join(uroman_path, "bin", "uroman.pl") +command = ["perl", script_path] + +process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# Execute the perl command +stdout, stderr = process.communicate(input=input_string.encode()) + +if process.returncode != 0: + raise ValueError(f"Error {process.returncode}: {stderr.decode()}") + +# Return the output as a string and skip the new-line character at the end +return stdout.decode()[:-1] + +text = "이봐 무슨 일이야" +uromanized_text = uromanize(text, uroman_path=os.environ["UROMAN"]) +inputs = tokenizer(text=uromanized_text, return_tensors="pt") +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(inputs["input_ids"]) +waveform = outputs.waveform[0] + +Tips: + +The MMS-TTS checkpoints are trained on lower-cased, unpunctuated text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_38.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..c493d7e82f05586b11b2ca54e5e1f2bc3e70140a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_38.txt @@ -0,0 +1 @@ +By default, the VitsTokenizer normalizes the inputs by removing any casing and punctuation, to avoid passing out-of-vocabulary characters to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_39.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..241e284d35aed3c0ea515c4151a60072e2672ed9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_39.txt @@ -0,0 +1 @@ +Hence, the model is agnostic to casing and punctuation, so these should be avoided in the text prompt. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_40.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..56671b78e16c2af162c49abd009c7a1228c4505c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_40.txt @@ -0,0 +1 @@ +You can disable normalisation by setting normalize=False in the call to the tokenizer, but this will lead to unexpected behaviour and is discouraged. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_41.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..b447b1c8742c082c00e60267b0695fc59cef9dc7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_41.txt @@ -0,0 +1 @@ +The speaking rate can be varied by setting the attribute model.speaking_rate to a chosen value. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_42.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..f220361918cca6bf53bc0496d137f08038c81539 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_42.txt @@ -0,0 +1,18 @@ +Likewise, the randomness of the noise is controlled by model.noise_scale: + +thon +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +model = VitsModel.from_pretrained("facebook/mms-tts-eng") +inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") +# make deterministic +set_seed(555) +# make speech faster and more noisy +model.speaking_rate = 1.5 +model.noise_scale = 0.8 +with torch.no_grad(): + outputs = model(**inputs) + +Language Identification (LID) +Different LID models are available based on the number of languages they can recognize - 126, 256, 512, 1024, 2048, 4017. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_43.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ede7b76d57842280f5a8d0a09fb8740d85b1f96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_43.txt @@ -0,0 +1,7 @@ +Inference +First, we install transformers and some other libraries +```bash +pip install torch accelerate datasets[audio] +pip install --upgrade transformers +``` +Next, we load a couple of audio samples via datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_44.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d9efd22036a55384e76ccb533c2b2a8ebd0bd5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_44.txt @@ -0,0 +1 @@ +Make sure that the audio data is sampled at 16,000 Hz (16 kHz). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_45.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..13580695d7fff87b075a549cd9a68a70169995bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_45.txt @@ -0,0 +1,41 @@ +from datasets import load_dataset, Audio +English +stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "en", split="test", streaming=True) +stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) +en_sample = next(iter(stream_data))["audio"]["array"] +Arabic +stream_data = load_dataset("mozilla-foundation/common_voice_13_0", "ar", split="test", streaming=True) +stream_data = stream_data.cast_column("audio", Audio(sampling_rate=16000)) +ar_sample = next(iter(stream_data))["audio"]["array"] + +Next, we load the model and processor + +from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor +import torch +model_id = "facebook/mms-lid-126" +processor = AutoFeatureExtractor.from_pretrained(model_id) +model = Wav2Vec2ForSequenceClassification.from_pretrained(model_id) + +Now we process the audio data, pass the processed audio data to the model to classify it into a language, just like we usually do for Wav2Vec2 audio classification models such as ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition + +English +inputs = processor(en_sample, sampling_rate=16_000, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs).logits +lang_id = torch.argmax(outputs, dim=-1)[0].item() +detected_lang = model.config.id2label[lang_id] +'eng' +Arabic +inputs = processor(ar_sample, sampling_rate=16_000, return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs).logits +lang_id = torch.argmax(outputs, dim=-1)[0].item() +detected_lang = model.config.id2label[lang_id] +'ara' + +To see all the supported languages of a checkpoint, you can print out the language ids as follows: +py +processor.id2label.values() +Audio Pretrained Models +Pretrained models are available for two different sizes - 300M , +1Bil. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_46.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..daf221f38126b0d8f247017323bd634de78bf494 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_46.txt @@ -0,0 +1,2 @@ +The MMS for ASR architecture is based on the Wav2Vec2 model, refer to Wav2Vec2's documentation page for further +details on how to finetune with models for various downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mms/chunk_47.txt b/chunked/content_aware_chunking/model_doc_mms/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac389a95eb495a4b51eb8b0b813b48b12b8c4059 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mms/chunk_47.txt @@ -0,0 +1 @@ +MMS-TTS uses the same model architecture as VITS, refer to VITS's documentation page for API reference. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbdeb6d9ffc7208bde89f3621d5c9050b9ea33bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_10.txt @@ -0,0 +1,3 @@ +On the +natural language inference tasks of GLUE, MobileBERT achieves a GLUE score of 77.7 (0.6 lower than BERT_BASE), and 62 ms +latency on a Pixel 4 phone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe5914883e9823a7fb5c1f4c3d74827a639929d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_11.txt @@ -0,0 +1,2 @@ +On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of +90.0/79.2 (1.5/2.1 higher than BERT_BASE). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eddc67fe25be6a60f868197c7003860294829c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by vshampor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc741df6cfd96188bef0f7d9a1e2a43e4a84f4d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_14.txt @@ -0,0 +1,4 @@ +Usage tips + +MobileBERT is a model with absolute position embeddings, so it's usually advised to pad the inputs on the right rather + than the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b534212144132c3a47a0a4675a3cfd7a8d086fcc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_15.txt @@ -0,0 +1 @@ +MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..efd1565fd3e336f6554fa6ce51a152104f9a29b6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_16.txt @@ -0,0 +1,2 @@ +It is therefore + efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. 
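Since MobileBERT relies on the MLM objective, a short hedged sketch of mask filling; the Hub checkpoint name google/mobilebert-uncased is an assumption of this example and the predicted token depends on the model:

```python
# Minimal mask-filling sketch with MobileBERT; checkpoint name assumed.
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased")
model = AutoModelForMaskedLM.from_pretrained("google/mobilebert-uncased")

inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

# Locate the [MASK] position and take the highest-scoring vocabulary entry.
mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
predicted_id = logits[0, mask_index].argmax(dim=-1)
print(tokenizer.decode(predicted_id))
```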
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ca5d327cd6422d456ce83191a8376864bdb0c4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_17.txt @@ -0,0 +1,2 @@ +Models trained + with a causal language modeling (CLM) objective are better in that regard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..941800c6e5ac12a104c92ca33024909cbae4e0a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_18.txt @@ -0,0 +1,67 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +MobileBertConfig +[[autodoc]] MobileBertConfig +MobileBertTokenizer +[[autodoc]] MobileBertTokenizer +MobileBertTokenizerFast +[[autodoc]] MobileBertTokenizerFast +MobileBert specific outputs +[[autodoc]] models.mobilebert.modeling_mobilebert.MobileBertForPreTrainingOutput +[[autodoc]] models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput + +MobileBertModel +[[autodoc]] MobileBertModel + - forward +MobileBertForPreTraining +[[autodoc]] MobileBertForPreTraining + - forward +MobileBertForMaskedLM +[[autodoc]] MobileBertForMaskedLM + - forward +MobileBertForNextSentencePrediction +[[autodoc]] MobileBertForNextSentencePrediction + - forward +MobileBertForSequenceClassification +[[autodoc]] MobileBertForSequenceClassification + - forward +MobileBertForMultipleChoice +[[autodoc]] MobileBertForMultipleChoice + - forward +MobileBertForTokenClassification +[[autodoc]] MobileBertForTokenClassification + - forward +MobileBertForQuestionAnswering +[[autodoc]] MobileBertForQuestionAnswering + - forward + +TFMobileBertModel +[[autodoc]] TFMobileBertModel + - call +TFMobileBertForPreTraining +[[autodoc]] TFMobileBertForPreTraining + - call +TFMobileBertForMaskedLM +[[autodoc]] TFMobileBertForMaskedLM + - call +TFMobileBertForNextSentencePrediction +[[autodoc]] TFMobileBertForNextSentencePrediction + - call +TFMobileBertForSequenceClassification +[[autodoc]] TFMobileBertForSequenceClassification + - call +TFMobileBertForMultipleChoice +[[autodoc]] TFMobileBertForMultipleChoice + - call +TFMobileBertForTokenClassification +[[autodoc]] TFMobileBertForTokenClassification + - call +TFMobileBertForQuestionAnswering +[[autodoc]] TFMobileBertForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c99afb61eb272149cbcb4480927c0541588a19d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_7.txt @@ -0,0 +1,2 @@ +To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE +model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..13ddcd8f39954f43c7ccb8f2308083e1c3777a23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_8.txt @@ -0,0 +1 @@ +Then, we conduct knowledge transfer from this teacher to MobileBERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilebert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed3aae15f7ecdf8511810d190fa15807fe7f6066 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilebert/chunk_9.txt @@ -0,0 +1,2 @@ +Empirical studies show that MobileBERT is +4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e5293ebfe5e68cda72c0a4d137353da93bacd2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_10.txt @@ -0,0 +1 @@ +Even though the checkpoint is trained on images of a specific size, the model will work on images of any size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8a7fe51c530cc87bb837c6418db16ec2469468f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_11.txt @@ -0,0 +1 @@ +The smallest supported image size is 32x32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f1c06a35ae736d2f2e80d1e12bda16c3e45f031 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_12.txt @@ -0,0 +1 @@ +One can use [MobileNetV1ImageProcessor] to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9df4e9cfd8a5b331c4718dde79ee55e5aaf9e18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_13.txt @@ -0,0 +1 @@ +The available image classification checkpoints are pre-trained on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8739a9ebde10879fb3579f30205a14f09146335 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_14.txt @@ -0,0 +1 @@ +However, the model predicts 1001 classes: the 1000 classes from ImageNet plus an extra "background" class (index 0). 
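A hedged end-to-end sketch of the classification flow described above (1001 labels, including the background class at index 0); the checkpoint name google/mobilenet_v1_1.0_224 and the example image URL are assumptions of this sketch:

```python
# Image-classification sketch for MobileNetV1; checkpoint and image URL assumed.
import torch
import requests
from PIL import Image
from transformers import MobileNetV1ImageProcessor, MobileNetV1ForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed example image
image = Image.open(requests.get(url, stream=True).raw)

processor = MobileNetV1ImageProcessor.from_pretrained("google/mobilenet_v1_1.0_224")
model = MobileNetV1ForImageClassification.from_pretrained("google/mobilenet_v1_1.0_224")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits  # shape (1, 1001): 1000 ImageNet classes + background

predicted_class = logits.argmax(-1).item()
print(model.config.id2label[predicted_class])
```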
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..52f740671f782141db8baa8b15803695b3172ff6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_15.txt @@ -0,0 +1 @@ +The original TensorFlow checkpoints use different padding rules than PyTorch, requiring the model to determine the padding amount at inference time, since this depends on the input image size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb9fa11882e68b6222a45aa996758adffb788928 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_16.txt @@ -0,0 +1 @@ +To use native PyTorch padding behavior, create a [MobileNetV1Config] with tf_padding = False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..3be5b214219d21821e1f7056b584d7481dd667e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_17.txt @@ -0,0 +1,3 @@ +Unsupported features: + +The [MobileNetV1Model] outputs a globally pooled version of the last hidden state. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6964b9a3d6879acf9d6719d1f4cad4e6c2cdf434 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_18.txt @@ -0,0 +1 @@ +In the original model it is possible to use a 7x7 average pooling layer with stride 2 instead of global pooling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..779648d90cc7bb487e28d5254788b5bd5d85ef61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_19.txt @@ -0,0 +1 @@ +For larger inputs, this gives a pooled output that is larger than 1x1 pixel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..23a4aca1dd6129f3c8183282fb317d8b2448e6ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_20.txt @@ -0,0 +1 @@ +The HuggingFace implementation does not support this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9075a73e29a117cb0c53c6c92094ed60f5dbbb26 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_21.txt @@ -0,0 +1 @@ +It is currently not possible to specify an output_stride. 
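To make the tf_padding note above concrete, a small hedged sketch of building a model with native PyTorch padding; the model below is randomly initialised from the configuration rather than loaded from a checkpoint:

```python
# Sketch: a MobileNetV1 configured for PyTorch-style padding (tf_padding=False).
# All other configuration values are left at their defaults.
from transformers import MobileNetV1Config, MobileNetV1Model

config = MobileNetV1Config(tf_padding=False)
model = MobileNetV1Model(config)  # randomly initialised, not pretrained
print(model.config.tf_padding)  # False
```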
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b9a12d4ab980abe8baa02e8830c9dc55f84863f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_22.txt @@ -0,0 +1 @@ +For smaller output strides, the original model invokes dilated convolution to prevent the spatial resolution from being reduced further. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..610c4121462aa2bb7240990192471e74fe93d2c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_23.txt @@ -0,0 +1 @@ +The output stride of the HuggingFace model is always 32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc685aae4aaf6634b77638a929dbe0a0efefafce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_24.txt @@ -0,0 +1 @@ +The original TensorFlow checkpoints include quantized models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e987a727448f29cccd22bf0e0c264d7d652b26 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_25.txt @@ -0,0 +1 @@ +We do not support these models as they include additional "FakeQuantization" operations to unquantize the weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8645356805904dd29e3f2c2bdf8f49dfac09610 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_26.txt @@ -0,0 +1 @@ +It's common to extract the output from the pointwise layers at indices 5, 11, 12, 13 for downstream purposes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_27.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70b1d275172fb434df5cc1fa18454376b836b0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_27.txt @@ -0,0 +1 @@ +Using output_hidden_states=True returns the output from all intermediate layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_28.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d60543bcbecaaa78d0484a3dc2b90c88c1e5b3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_28.txt @@ -0,0 +1 @@ +There is currently no way to limit this to specific layers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_29.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..803ae7025464d3ed807063c969a7a2a0890575e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_29.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileNetV1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_30.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..864c48956baed322d294e16a7db787b8d194557b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_30.txt @@ -0,0 +1 @@ +[MobileNetV1ForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_31.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_31.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_32.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_32.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_33.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b22e51453ead01073acc93975d546bd6e6c265d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_33.txt @@ -0,0 +1,14 @@ +MobileNetV1Config +[[autodoc]] MobileNetV1Config +MobileNetV1FeatureExtractor +[[autodoc]] MobileNetV1FeatureExtractor + - preprocess +MobileNetV1ImageProcessor +[[autodoc]] MobileNetV1ImageProcessor + - preprocess +MobileNetV1Model +[[autodoc]] MobileNetV1Model + - forward +MobileNetV1ForImageClassification +[[autodoc]] MobileNetV1ForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..81b0601561ff9bb89dbfac81c8888c416b779d6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v1/chunk_9.txt @@ -0,0 +1,3 @@ +Usage tips + +The checkpoints are named mobilenet_v1_depth_size, for example mobilenet_v1_1.0_224, where 1.0 is the depth multiplier (sometimes also referred to as "alpha" or the width multiplier) and 224 is the resolution of the input images the model was trained on. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9978cf159e838fa058b5069a93ff3768b8da8ad7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +The checkpoints are named mobilenet_v2_depth_size, for example mobilenet_v2_1.0_224, where 1.0 is the depth multiplier (sometimes also referred to as "alpha" or the width multiplier) and 224 is the resolution of the input images the model was trained on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e5293ebfe5e68cda72c0a4d137353da93bacd2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_13.txt @@ -0,0 +1 @@ +Even though the checkpoint is trained on images of a specific size, the model will work on images of any size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8a7fe51c530cc87bb837c6418db16ec2469468f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_14.txt @@ -0,0 +1 @@ +The smallest supported image size is 32x32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7423b52fa9248235a1a347eba2962275d51531ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_15.txt @@ -0,0 +1 @@ +One can use [MobileNetV2ImageProcessor] to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9df4e9cfd8a5b331c4718dde79ee55e5aaf9e18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_16.txt @@ -0,0 +1 @@ +The available image classification checkpoints are pre-trained on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8739a9ebde10879fb3579f30205a14f09146335 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_17.txt @@ -0,0 +1 @@ +However, the model predicts 1001 classes: the 1000 classes from ImageNet plus an extra "background" class (index 0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e9c52cf305a8370689bc11930f442da37a8f37a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_18.txt @@ -0,0 +1 @@ +The segmentation model uses a DeepLabV3+ head. 
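A hedged sketch of running the DeepLabV3+ segmentation head mentioned above; the checkpoint name google/deeplabv3_mobilenet_v2_1.0_513 and the image URL are assumptions, and the post-processing call follows the image-processor API listed later in this document:

```python
# Semantic-segmentation sketch for MobileNetV2; checkpoint and image URL assumed.
import torch
import requests
from PIL import Image
from transformers import MobileNetV2ImageProcessor, MobileNetV2ForSemanticSegmentation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # assumed example image
image = Image.open(requests.get(url, stream=True).raw)

processor = MobileNetV2ImageProcessor.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513")
model = MobileNetV2ForSemanticSegmentation.from_pretrained("google/deeplabv3_mobilenet_v2_1.0_513")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Resize the logits back to the input image size and take the per-pixel class index.
segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(segmentation.shape)  # (height, width) map of PASCAL VOC class indices
```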
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..be137ac83c1f1157bb58487f18cb5943b50d1ffa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_19.txt @@ -0,0 +1 @@ +The available semantic segmentation checkpoints are pre-trained on PASCAL VOC. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..52f740671f782141db8baa8b15803695b3172ff6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_20.txt @@ -0,0 +1 @@ +The original TensorFlow checkpoints use different padding rules than PyTorch, requiring the model to determine the padding amount at inference time, since this depends on the input image size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..90e57c19c1c99313514200b8f15a460c833b6530 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_21.txt @@ -0,0 +1 @@ +To use native PyTorch padding behavior, create a [MobileNetV2Config] with tf_padding = False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6dc6493ef59308a5c390cebcfff5e1af6c7852b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_22.txt @@ -0,0 +1,3 @@ +Unsupported features: + +The [MobileNetV2Model] outputs a globally pooled version of the last hidden state. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..18903ddf62bc0bb82602e29fd10239f604e62035 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_23.txt @@ -0,0 +1 @@ +In the original model it is possible to use an average pooling layer with a fixed 7x7 window and stride 1 instead of global pooling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f0c0d5c7d5266a2b4f4f3dea789cf130b81d8be --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_24.txt @@ -0,0 +1 @@ +For inputs that are larger than the recommended image size, this gives a pooled output that is larger than 1x1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e44cfabbeebf6c83d99f37a0b0a67069693e8324 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_25.txt @@ -0,0 +1 @@ +The Hugging Face implementation does not support this. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc685aae4aaf6634b77638a929dbe0a0efefafce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_26.txt @@ -0,0 +1 @@ +The original TensorFlow checkpoints include quantized models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_27.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e987a727448f29cccd22bf0e0c264d7d652b26 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_27.txt @@ -0,0 +1 @@ +We do not support these models as they include additional "FakeQuantization" operations to unquantize the weights. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_28.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..96db65cf45ddd1f85a25ea33d63928fc5f1e00de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_28.txt @@ -0,0 +1 @@ +It's common to extract the output from the expansion layers at indices 10 and 13, as well as the output from the final 1x1 convolution layer, for downstream purposes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_29.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70b1d275172fb434df5cc1fa18454376b836b0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_29.txt @@ -0,0 +1 @@ +Using output_hidden_states=True returns the output from all intermediate layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_30.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d60543bcbecaaa78d0484a3dc2b90c88c1e5b3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_30.txt @@ -0,0 +1 @@ +There is currently no way to limit this to specific layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_31.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d69665b49e6150e2213427f5dfed234ae67d72e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_31.txt @@ -0,0 +1 @@ +The DeepLabV3+ segmentation head does not use the final convolution layer from the backbone, but this layer gets computed anyway. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_32.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3d1877efd40ce291081d7527065eae6aaab932b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_32.txt @@ -0,0 +1 @@ +There is currently no way to tell [MobileNetV2Model] up to which layer it should run. 
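Since there is no way to restrict which hidden states are returned, a common pattern is to request all of them and index into the returned tuple. A minimal sketch; which tuple entries correspond to the expansion layers at indices 10 and 13 is an assumption you should verify by inspecting the printed shapes:

import torch
from transformers import MobileNetV2Model

model = MobileNetV2Model.from_pretrained("google/mobilenet_v2_1.0_224")
pixel_values = torch.rand(1, 3, 224, 224)  # stand-in for a preprocessed image

with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

# outputs.hidden_states is a tuple with one feature map per intermediate layer;
# print the shapes and pick the ones you need for your downstream head
for i, feature_map in enumerate(outputs.hidden_states):
    print(i, tuple(feature_map.shape))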
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_33.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..87737ea3ddd240ff38534c3679413b879641e8fc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_33.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileNetV2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_34.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..45e3012d67d9e95aa19fd01e0fd978345e92755d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_34.txt @@ -0,0 +1 @@ +[MobileNetV2ForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_35.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a615c69674a74107b4aa3e1e0caf11d765d0b2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_35.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Semantic segmentation +- Semantic segmentation task guide +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_36.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_36.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_37.txt b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1a9a8a56a421ba55be2c25f07a7739a2c2cd400 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilenet_v2/chunk_37.txt @@ -0,0 +1,19 @@ +MobileNetV2Config +[[autodoc]] MobileNetV2Config +MobileNetV2FeatureExtractor +[[autodoc]] MobileNetV2FeatureExtractor + - preprocess + - post_process_semantic_segmentation +MobileNetV2ImageProcessor +[[autodoc]] MobileNetV2ImageProcessor + - preprocess + - post_process_semantic_segmentation +MobileNetV2Model +[[autodoc]] MobileNetV2Model + - forward +MobileNetV2ForImageClassification +[[autodoc]] MobileNetV2ForImageClassification + - forward +MobileNetV2ForSemanticSegmentation +[[autodoc]] MobileNetV2ForSemanticSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a305c2acf3d5e5488221a92e6a7096e27030015f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_10.txt @@ -0,0 +1 @@ +Our results show that MobileViT significantly outperforms CNN- and ViT-based networks across different tasks and datasets. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1c6201b1207599b93a182862062bf12eacb4f9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_11.txt @@ -0,0 +1 @@ +On the ImageNet-1k dataset, MobileViT achieves top-1 accuracy of 78.4% with about 6 million parameters, which is 3.2% and 6.2% more accurate than MobileNetv3 (CNN-based) and DeIT (ViT-based) for a similar number of parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d991c7955a4fd71a7d27e39e8d3cea8cc2981b8d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_12.txt @@ -0,0 +1 @@ +On the MS-COCO object detection task, MobileViT is 5.7% more accurate than MobileNetv3 for a similar number of parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..256a7be539b2b15eadb21cb751bb3023be3378bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by matthijs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea03abbd8d01784fb6204c08bf62e9711a20b91e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_14.txt @@ -0,0 +1 @@ +The TensorFlow version of the model was contributed by sayakpaul. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..54d81928f00c7e0a583e1e5c7d771ab0fc5c861e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_15.txt @@ -0,0 +1 @@ +The original code and weights can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa1fbbe15d9b186053d11d7b41e45bba2d6801f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_16.txt @@ -0,0 +1,3 @@ +Usage tips + +MobileViT is more like a CNN than a Transformer model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..dabf8c9973dfc85eeb9fa320b1df589df2c91655 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_17.txt @@ -0,0 +1 @@ +It does not work on sequence data but on batches of images. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c6b9bda369474b0e0a4ec0f183073262feeffc4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_18.txt @@ -0,0 +1 @@ +Unlike ViT, there are no embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a14be881a820671862c2a27571962dcae34e0b81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_19.txt @@ -0,0 +1 @@ +The backbone model outputs a feature map. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9848d06623c984f61003569bcd50313a42ef6f28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_20.txt @@ -0,0 +1 @@ +You can follow this tutorial for a lightweight introduction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..71a0d3e9e1625aadfca3d255055ba961235eadde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_21.txt @@ -0,0 +1 @@ +One can use [MobileViTImageProcessor] to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f40d1b7832502db29769a57dd422d8ab7e53212 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_22.txt @@ -0,0 +1 @@ +Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9df4e9cfd8a5b331c4718dde79ee55e5aaf9e18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_23.txt @@ -0,0 +1 @@ +The available image classification checkpoints are pre-trained on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..bef3a95df19e9c10a9ff73bae218e844d5679a77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_24.txt @@ -0,0 +1 @@ +The segmentation model uses a DeepLabV3 head. 
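As a concrete illustration of the MobileViT usage tips, here is a minimal classification sketch. It assumes the apple/mobilevit-small checkpoint and relies on [MobileViTImageProcessor] to handle resizing and the RGB-to-BGR channel flip mentioned above:

from PIL import Image
import requests
import torch
from transformers import MobileViTImageProcessor, MobileViTForImageClassification

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # placeholder image
image = Image.open(requests.get(url, stream=True).raw)

processor = MobileViTImageProcessor.from_pretrained("apple/mobilevit-small")
model = MobileViTForImageClassification.from_pretrained("apple/mobilevit-small")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])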
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_25.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..be137ac83c1f1157bb58487f18cb5943b50d1ffa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_25.txt @@ -0,0 +1 @@ +The available semantic segmentation checkpoints are pre-trained on PASCAL VOC. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_26.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..418e41b8bbfa2b3c1355c1404b8598d7e7a77e91 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_26.txt @@ -0,0 +1 @@ +As the name suggests MobileViT was designed to be performant and efficient on mobile phones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_27.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa3924c8b9dd8c57e9b1706f0452771d967fc7c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_27.txt @@ -0,0 +1 @@ +The TensorFlow versions of the MobileViT models are fully compatible with TensorFlow Lite. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_28.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ed877c8633e208f448ebc147952bad74f2a0f35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_28.txt @@ -0,0 +1,20 @@ +You can use the following code to convert a MobileViT checkpoint (be it image classification or semantic segmentation) to generate a + TensorFlow Lite model: + +from transformers import TFMobileViTForImageClassification +import tensorflow as tf +model_ckpt = "apple/mobilevit-xx-small" +model = TFMobileViTForImageClassification.from_pretrained(model_ckpt) +converter = tf.lite.TFLiteConverter.from_keras_model(model) +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [ + tf.lite.OpsSet.TFLITE_BUILTINS, + tf.lite.OpsSet.SELECT_TF_OPS, +] +tflite_model = converter.convert() +tflite_filename = model_ckpt.split("/")[-1] + ".tflite" +with open(tflite_filename, "wb") as f: + f.write(tflite_model) + +The resulting model will be just about an MB making it a good fit for mobile applications where resources and network + bandwidth can be constrained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_29.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..a44654f84340087b1d4f570eb0145c4a65d1f7a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_29.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with MobileViT. 
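For the DeepLabV3 head and the PASCAL VOC segmentation checkpoints mentioned above, a sketch along these lines should work; the apple/deeplabv3-mobilevit-small checkpoint name is an assumption, so substitute whichever segmentation checkpoint you use:

from PIL import Image
import requests
import torch
from transformers import MobileViTImageProcessor, MobileViTForSemanticSegmentation

url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # placeholder image
image = Image.open(requests.get(url, stream=True).raw)

processor = MobileViTImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Upsample the logits and take the per-pixel argmax to get a (height, width) map of class indices
segmentation = processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(segmentation.shape)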
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_30.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..732d1e52f74870237b6b6963a5e3a799072a8689 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_30.txt @@ -0,0 +1 @@ +[MobileViTForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_31.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a615c69674a74107b4aa3e1e0caf11d765d0b2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_31.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Semantic segmentation +- Semantic segmentation task guide +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_32.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_32.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_33.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b248acd26635555df8247b53e55d61262d9ef4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_33.txt @@ -0,0 +1,30 @@ +MobileViTConfig +[[autodoc]] MobileViTConfig +MobileViTFeatureExtractor +[[autodoc]] MobileViTFeatureExtractor + - call + - post_process_semantic_segmentation +MobileViTImageProcessor +[[autodoc]] MobileViTImageProcessor + - preprocess + - post_process_semantic_segmentation + +MobileViTModel +[[autodoc]] MobileViTModel + - forward +MobileViTForImageClassification +[[autodoc]] MobileViTForImageClassification + - forward +MobileViTForSemanticSegmentation +[[autodoc]] MobileViTForSemanticSegmentation + - forward + +TFMobileViTModel +[[autodoc]] TFMobileViTModel + - call +TFMobileViTForImageClassification +[[autodoc]] TFMobileViTForImageClassification + - call +TFMobileViTForSemanticSegmentation +[[autodoc]] TFMobileViTForSemanticSegmentation + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..57432c938dcc9d7df794ba76deafc00097081de7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_7.txt @@ -0,0 +1 @@ +In this paper, we ask the following question: is it possible to combine the strengths of CNNs and ViTs to build a light-weight and low latency network for mobile vision tasks? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fb4efc8ab14f1dc448e46ec41a17f3ad03f8b14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_8.txt @@ -0,0 +1 @@ +Towards this end, we introduce MobileViT, a light-weight and general-purpose vision transformer for mobile devices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..34395e6246bd37f52ba1f8e5665e6854556d0d10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevit/chunk_9.txt @@ -0,0 +1 @@ +MobileViT presents a different perspective for the global processing of information with transformers, i.e., transformers as convolutions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1f5abf1b531dbc8d7b8453fa41953dbdcde0cd6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by shehan97. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3fd3100c8814ec432f88d611b70893f632d5b70 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +MobileViTV2 is more like a CNN than a Transformer model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..dabf8c9973dfc85eeb9fa320b1df589df2c91655 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_13.txt @@ -0,0 +1 @@ +It does not work on sequence data but on batches of images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c6b9bda369474b0e0a4ec0f183073262feeffc4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_14.txt @@ -0,0 +1 @@ +Unlike ViT, there are no embeddings. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a14be881a820671862c2a27571962dcae34e0b81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_15.txt @@ -0,0 +1 @@ +The backbone model outputs a feature map. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..71a0d3e9e1625aadfca3d255055ba961235eadde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_16.txt @@ -0,0 +1 @@ +One can use [MobileViTImageProcessor] to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f40d1b7832502db29769a57dd422d8ab7e53212 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_17.txt @@ -0,0 +1 @@ +Note that if you do your own preprocessing, the pretrained checkpoints expect images to be in BGR pixel order (not RGB). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9df4e9cfd8a5b331c4718dde79ee55e5aaf9e18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_18.txt @@ -0,0 +1 @@ +The available image classification checkpoints are pre-trained on ImageNet-1k (also referred to as ILSVRC 2012, a collection of 1.3 million images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bef3a95df19e9c10a9ff73bae218e844d5679a77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_19.txt @@ -0,0 +1 @@ +The segmentation model uses a DeepLabV3 head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..be137ac83c1f1157bb58487f18cb5943b50d1ffa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_20.txt @@ -0,0 +1 @@ +The available semantic segmentation checkpoints are pre-trained on PASCAL VOC. 
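Because MobileViTV2 behaves like a CNN backbone, the base model returns a spatial feature map rather than a sequence of token embeddings, as the tips above note. A minimal sketch; the apple/mobilevitv2-1.0-imagenet1k-256 checkpoint name is an assumption:

import torch
from transformers import MobileViTV2Model

checkpoint = "apple/mobilevitv2-1.0-imagenet1k-256"  # assumed checkpoint name
model = MobileViTV2Model.from_pretrained(checkpoint)

pixel_values = torch.rand(1, 3, 256, 256)  # stand-in for an image preprocessed with MobileViTImageProcessor
with torch.no_grad():
    outputs = model(pixel_values)

# A (batch, channels, height, width) feature map, not token embeddings
print(outputs.last_hidden_state.shape)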
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcdd7438fe0a85782f16a663c61c9c571b70fd0c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_21.txt @@ -0,0 +1,11 @@ +MobileViTV2Config +[[autodoc]] MobileViTV2Config +MobileViTV2Model +[[autodoc]] MobileViTV2Model + - forward +MobileViTV2ForImageClassification +[[autodoc]] MobileViTV2ForImageClassification + - forward +MobileViTV2ForSemanticSegmentation +[[autodoc]] MobileViTV2ForSemanticSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..06cdaadb3964215f95e94e93c1d68aad74849b03 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_6.txt @@ -0,0 +1 @@ +O(k). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7ab29e3ca5bb9f8bf59854fd7baca2ecb20e701 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_7.txt @@ -0,0 +1 @@ +A simple yet effective characteristic of the proposed method is that it uses element-wise operations for computing self-attention, making it a good choice for resource-constrained devices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7466fa2b04672ea024a31483869492336d3dfede --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_8.txt @@ -0,0 +1 @@ +The improved model, MobileViTV2, is state-of-the-art on several mobile vision tasks, including ImageNet object classification and MS-COCO object detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce66522c95f7996fdfd42fe40e2b894e2fd97a86 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mobilevitv2/chunk_9.txt @@ -0,0 +1 @@ +With about three million parameters, MobileViTV2 achieves a top-1 accuracy of 75.6% on the ImageNet dataset, outperforming MobileViT by about 1% while running 3.2× faster on a mobile device. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..eefabd09ca19c037d275f10ef96c8a0d619bbf6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_10.txt @@ -0,0 +1,2 @@ +Usage tips +MPNet doesn't have token_type_ids, you don't need to indicate which token belongs to which segment. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d9ecc0a9d3fb8c673637591058fc103de99b0a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_11.txt @@ -0,0 +1,2 @@ +Just +separate your segments with the separation token tokenizer.sep_token (or [sep]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1d31149a3f7173aa5ae6c5ec86ff20b72875b21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_12.txt @@ -0,0 +1,56 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +MPNetConfig +[[autodoc]] MPNetConfig +MPNetTokenizer +[[autodoc]] MPNetTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +MPNetTokenizerFast +[[autodoc]] MPNetTokenizerFast + +MPNetModel +[[autodoc]] MPNetModel + - forward +MPNetForMaskedLM +[[autodoc]] MPNetForMaskedLM + - forward +MPNetForSequenceClassification +[[autodoc]] MPNetForSequenceClassification + - forward +MPNetForMultipleChoice +[[autodoc]] MPNetForMultipleChoice + - forward +MPNetForTokenClassification +[[autodoc]] MPNetForTokenClassification + - forward +MPNetForQuestionAnswering +[[autodoc]] MPNetForQuestionAnswering + - forward + +TFMPNetModel +[[autodoc]] TFMPNetModel + - call +TFMPNetForMaskedLM +[[autodoc]] TFMPNetForMaskedLM + - call +TFMPNetForSequenceClassification +[[autodoc]] TFMPNetForSequenceClassification + - call +TFMPNetForMultipleChoice +[[autodoc]] TFMPNetForMultipleChoice + - call +TFMPNetForTokenClassification +[[autodoc]] TFMPNetForTokenClassification + - call +TFMPNetForQuestionAnswering +[[autodoc]] TFMPNetForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b88fac73d45e591ee499b59bd5d7fc3112fd3b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_6.txt @@ -0,0 +1,4 @@ +MPNet leverages the +dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position +information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in +XLNet). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..df877427d89be4fa651829785b7dd7324186040c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_7.txt @@ -0,0 +1,2 @@ +We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of +down-streaming tasks (GLUE, SQuAD, etc). 
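The segment-separation tip above boils down to passing the two segments as a text pair (or joining them with tokenizer.sep_token yourself). A minimal sketch using the microsoft/mpnet-base checkpoint:

import torch
from transformers import MPNetTokenizer, MPNetModel

tokenizer = MPNetTokenizer.from_pretrained("microsoft/mpnet-base")
model = MPNetModel.from_pretrained("microsoft/mpnet-base")

# No token_type_ids needed: the tokenizer simply joins the two segments with the separator token
inputs = tokenizer("How large is MPNet?", "MPNet is a pre-trained language model.", return_tensors="pt")
print(tokenizer.decode(inputs["input_ids"][0]))

with torch.no_grad():
    outputs = model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"])
print(outputs.last_hidden_state.shape)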
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa40cae6f63bdeb5922bc906b970b9acf25225b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_8.txt @@ -0,0 +1,3 @@ +Experimental results show that MPNet outperforms MLM and PLM by a large +margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g., +BERT, XLNet, RoBERTa) under the same model setting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mpnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpnet/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpt/chunk_2.txt b/chunked/content_aware_chunking/model_doc_mpt/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a92f6e0c36350ae0b20c493f933f32ab7f254aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpt/chunk_2.txt @@ -0,0 +1 @@ +MPT models are GPT-style decoder-only transformers with several improvements: performance-optimized layer implementations, architecture changes that provide greater training stability, and the elimination of context length limits by replacing positional embeddings with ALiBi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpt/chunk_3.txt b/chunked/content_aware_chunking/model_doc_mpt/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f506515f3fa22fa40b611d57a326f5673b9be2a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpt/chunk_3.txt @@ -0,0 +1,5 @@ +MPT base: MPT base pre-trained models on next token prediction +MPT instruct: MPT base models fine-tuned on instruction based tasks +MPT storywriter: MPT base models fine-tuned for 2500 steps on 65k-token excerpts of fiction books contained in the books3 corpus, this enables the model to handle very long sequences + +The original code is available at the llm-foundry repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpt/chunk_4.txt b/chunked/content_aware_chunking/model_doc_mpt/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..774bebbc21f5d6a19b2b37320bb0ce30eacfddb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpt/chunk_4.txt @@ -0,0 +1,5 @@ +Read more about it in the release blogpost +Usage tips + +Learn more about some techniques behind training of the model in this section of llm-foundry repository +If you want to use the advanced version of the model (triton kernels, direct flash attention integration), you can still use the original model implementation by adding trust_remote_code=True when calling from_pretrained. 
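The trust_remote_code tip above looks roughly like this in practice; the mosaicml/mpt-7b checkpoint is used as an example and is large, so expect a sizeable download:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "mosaicml/mpt-7b"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Transformers' native MPT implementation
model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16)
# Or: the original implementation with the extra features (triton kernels, direct flash attention integration)
# model = AutoModelForCausalLM.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, trust_remote_code=True)

inputs = tokenizer("MosaicML released MPT-7B, a decoder-only transformer that", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))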
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpt/chunk_5.txt b/chunked/content_aware_chunking/model_doc_mpt/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b632e5352b5386502a70d31624d25c854eeb3c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpt/chunk_5.txt @@ -0,0 +1,3 @@ +Resources + +Fine-tuning Notebook on how to fine-tune MPT-7B on a free Google Colab instance to turn the model into a Chatbot. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mpt/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mpt/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..644e61dd019ed26133555170ce5d8d72a4408d87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mpt/chunk_6.txt @@ -0,0 +1,18 @@ +MptConfig +[[autodoc]] MptConfig + - all +MptModel +[[autodoc]] MptModel + - forward +MptForCausalLM +[[autodoc]] MptForCausalLM + - forward +MptForSequenceClassification +[[autodoc]] MptForSequenceClassification + - forward +MptForTokenClassification +[[autodoc]] MptForTokenClassification + - forward +MptForQuestionAnswering +[[autodoc]] MptForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..91782b49d9537ddab31ef0994b16bc2c06b062dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_10.txt @@ -0,0 +1,20 @@ +MraConfig +[[autodoc]] MraConfig +MraModel +[[autodoc]] MraModel + - forward +MraForMaskedLM +[[autodoc]] MraForMaskedLM + - forward +MraForSequenceClassification +[[autodoc]] MraForSequenceClassification + - forward +MraForMultipleChoice +[[autodoc]] MraForMultipleChoice + - forward +MraForTokenClassification +[[autodoc]] MraForTokenClassification + - forward +MraForQuestionAnswering +[[autodoc]] MraForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_4.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..20f7acb590faa18e42dd591cfd361583645a1c89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_4.txt @@ -0,0 +1 @@ +In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_5.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..64ab3b1b89988bc8faadde0bb7a9a2e86b898ecf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_5.txt @@ -0,0 +1 @@ +We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..88e570a27f97a537df029c14f0ac1db01070f973 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_6.txt @@ -0,0 +1 @@ +We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b8827c21c81a1ab02ba08882803032b735fef06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_7.txt @@ -0,0 +1 @@ +Code is available at https://github.com/mlpen/mra-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe063d0f82bdd9b95cdc2cee77e4423709b2ab51 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by novice03. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mra/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mra/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mra/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..632218a4405befd1ef4e9eca96d08a28bd5fb0b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_10.txt @@ -0,0 +1,11 @@ +Google has released the following variants: + +google/mt5-small + +google/mt5-base + +google/mt5-large + +google/mt5-xl + +google/mt5-xxl. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf67fffa4f5d9001fd37fd419894f4c6591262b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_12.txt @@ -0,0 +1,2 @@ +The original code can be +found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aed7123c3e528e2e6906ea111d40a13a745cc31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_13.txt @@ -0,0 +1,10 @@ +Resources + +Translation task guide +Summarization task guide + +MT5Config +[[autodoc]] MT5Config +MT5Tokenizer +[[autodoc]] MT5Tokenizer +See [T5Tokenizer] for all details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..36d6503e5776971ae399f7ae4bb69dbb9dbae8ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_14.txt @@ -0,0 +1,3 @@ +MT5TokenizerFast +[[autodoc]] MT5TokenizerFast +See [T5TokenizerFast] for all details. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f6d4001df7269bec6ce6784fdd2e2aa7907aa49 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_15.txt @@ -0,0 +1,26 @@ +MT5Model +[[autodoc]] MT5Model +MT5ForConditionalGeneration +[[autodoc]] MT5ForConditionalGeneration +MT5EncoderModel +[[autodoc]] MT5EncoderModel +MT5ForSequenceClassification +[[autodoc]] MT5ForSequenceClassification +MT5ForTokenClassification +[[autodoc]] MT5ForTokenClassification +MT5ForQuestionAnswering +[[autodoc]] MT5ForQuestionAnswering + +TFMT5Model +[[autodoc]] TFMT5Model +TFMT5ForConditionalGeneration +[[autodoc]] TFMT5ForConditionalGeneration +TFMT5EncoderModel +[[autodoc]] TFMT5EncoderModel + +FlaxMT5Model +[[autodoc]] FlaxMT5Model +FlaxMT5ForConditionalGeneration +[[autodoc]] FlaxMT5ForConditionalGeneration +FlaxMT5EncoderModel +[[autodoc]] FlaxMT5EncoderModel \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_5.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..93691f939232d602e19af165095da383935cf2ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_5.txt @@ -0,0 +1,2 @@ +All of the code and model +checkpoints used in this work are publicly available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..28130f651725e8ee2c5438d9e1b1a6b82501b206 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_6.txt @@ -0,0 +1 @@ +Note: mT5 was only pre-trained on mC4 excluding any supervised training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccca8a3540563470a22f72ed41e4a10b2365dfe6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_7.txt @@ -0,0 +1 @@ +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. 
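Since mT5 needs fine-tuning before it is useful downstream, a single supervised training step looks roughly like the sketch below, assuming the google/mt5-small checkpoint and a recent Transformers version; note that no task prefix is added for single-task fine-tuning:

import torch
from transformers import AutoTokenizer, MT5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/mt5-small")
model = MT5ForConditionalGeneration.from_pretrained("google/mt5-small")

# Toy translation pair; in practice these come from your fine-tuning dataset
inputs = tokenizer("Das Haus ist wunderbar.", return_tensors="pt")
labels = tokenizer(text_target="The house is wonderful.", return_tensors="pt").input_ids

loss = model(**inputs, labels=labels).loss
print(float(loss))
loss.backward()  # hook this into your usual optimizer / Trainer loop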
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f11c1f639516ce344e0d8d3c3a62db8d3324d4eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_8.txt @@ -0,0 +1,2 @@ +Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task +fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mt5/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mt5/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02171176dfea8686a381c899b340100c72beb4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mt5/chunk_9.txt @@ -0,0 +1 @@ +If you are doing multi-task fine-tuning, you should use a prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_19.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a5a2a1b2b8a8368666653df75631b7b9f4f5235 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_19.txt @@ -0,0 +1,3 @@ +Sampling is enabled by default, +and can be explicitly specified by setting do_sample=True in the call to [MusicgenForConditionalGeneration.generate], +or by overriding the model's generation config (see below). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_20.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec26c49de58dc62956563a80611ce7b50d688cb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_20.txt @@ -0,0 +1 @@ +Generation is limited by the sinusoidal positional embeddings to 30 second inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_21.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..846af708e4ff0fc6bda31caabb97b6e86557678a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_21.txt @@ -0,0 +1,3 @@ +Meaning, MusicGen cannot generate more +than 30 seconds of audio (1503 tokens), and input audio passed by Audio-Prompted Generation contributes to this limit so, +given an input of 20 seconds of audio, MusicGen cannot generate more than 10 seconds of additional audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_22.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..619fb7dc01d5108aacc1fad47a00dc7727f71490 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_22.txt @@ -0,0 +1 @@ +Transformers supports both mono (1-channel) and stereo (2-channel) variants of MusicGen. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_23.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..dded3e422d118c974e1d16013772328223541505 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_23.txt @@ -0,0 +1,2 @@ +The mono channel versions +generate a single set of codebooks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_24.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd6950245ac810adf61b68dddfae111278438ca5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_24.txt @@ -0,0 +1,2 @@ +The stereo versions generate 2 sets of codebooks, 1 for each channel (left/right), +and each set of codebooks is decoded independently through the audio compression model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_25.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ca95c8c9c9eb8699cdb56fed7311b8ab95e643c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_25.txt @@ -0,0 +1,2 @@ +The audio streams for each +channel are combined to give the final stereo output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_26.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3486bebc84871fb4aa450ea33279d250645a22d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_26.txt @@ -0,0 +1,11 @@ +Unconditional Generation +The inputs for unconditional (or 'null') generation can be obtained through the method +[MusicgenForConditionalGeneration.get_unconditional_inputs]: +thon + +from transformers import MusicgenForConditionalGeneration +model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") +unconditional_inputs = model.get_unconditional_inputs(num_samples=1) +audio_values = model.generate(**unconditional_inputs, do_sample=True, max_new_tokens=256) + +The audio outputs are a three-dimensional Torch tensor of shape (batch_size, num_channels, sequence_length). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_27.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d72ee2067c331caaa4b2784669550b272421ea0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_27.txt @@ -0,0 +1,8 @@ +To listen +to the generated audio samples, you can either play them in an ipynb notebook: +thon +from IPython.display import Audio +sampling_rate = model.config.audio_encoder.sampling_rate +Audio(audio_values[0].numpy(), rate=sampling_rate) + +Or save them as a .wav file using a third-party library, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_28.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9694e4fc8d75260771b21c1a1bffacbc0f5fbeb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_28.txt @@ -0,0 +1,25 @@ +scipy: +thon + +import scipy +sampling_rate = model.config.audio_encoder.sampling_rate +scipy.io.wavfile.write("musicgen_out.wav", rate=sampling_rate, data=audio_values[0, 0].numpy()) + +Text-Conditional Generation +The model can generate an audio sample conditioned on a text prompt through use of the [MusicgenProcessor] to pre-process +the inputs: +thon + +from transformers import AutoProcessor, MusicgenForConditionalGeneration +processor = AutoProcessor.from_pretrained("facebook/musicgen-small") +model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") +inputs = processor( + text=["80s pop track with bassy drums and synth", "90s rock song with loud guitars and heavy drums"], + padding=True, + return_tensors="pt", + ) +audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256) + +The guidance_scale is used in classifier free guidance (CFG), setting the weighting between the conditional logits +(which are predicted from the text prompts) and the unconditional logits (which are predicted from an unconditional or +'null' prompt). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_29.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca047523261d9bd1f532121bd3aa88bc082395f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_29.txt @@ -0,0 +1,2 @@ +Higher guidance scale encourages the model to generate samples that are more closely linked to the input +prompt, usually at the expense of poorer audio quality. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_30.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..3adb749be2e45db3e0b47e321d97a38b46b36818 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_30.txt @@ -0,0 +1 @@ +CFG is enabled by setting guidance_scale > 1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_31.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..57d064b2601da9803ce975b7f6e4ccaf21a329f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_31.txt @@ -0,0 +1,2 @@ +For best results, +use guidance_scale=3 (default). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_32.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a6d9a89a5064df56d284ef3a91bc18676b4131c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_32.txt @@ -0,0 +1,2 @@ +Audio-Prompted Generation +The same [MusicgenProcessor] can be used to pre-process an audio prompt that is used for audio continuation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_33.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..c067c15d77ce65fb3e0236d16800cfbfccc8ccf4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_33.txt @@ -0,0 +1,67 @@ +In the +following example, we load an audio file using the 🤗 Datasets library, which can be pip installed through the command +below: + +pip install --upgrade pip +pip install datasets[audio] +thon + +from transformers import AutoProcessor, MusicgenForConditionalGeneration +from datasets import load_dataset +processor = AutoProcessor.from_pretrained("facebook/musicgen-small") +model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") +dataset = load_dataset("sanchit-gandhi/gtzan", split="train", streaming=True) +sample = next(iter(dataset))["audio"] +take the first half of the audio sample +sample["array"] = sample["array"][: len(sample["array"]) // 2] +inputs = processor( + audio=sample["array"], + sampling_rate=sample["sampling_rate"], + text=["80s blues track with groovy saxophone"], + padding=True, + return_tensors="pt", + ) +audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256) + +For batched audio-prompted generation, the generated audio_values can be post-processed to remove padding by using the +[MusicgenProcessor] class: +thon + +from transformers import AutoProcessor, MusicgenForConditionalGeneration +from datasets import load_dataset +processor = AutoProcessor.from_pretrained("facebook/musicgen-small") +model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") +dataset = load_dataset("sanchit-gandhi/gtzan", split="train", streaming=True) +sample = next(iter(dataset))["audio"] +take the first quarter of the audio sample +sample_1 = sample["array"][: len(sample["array"]) // 4] +take the first half of the audio sample +sample_2 = sample["array"][: len(sample["array"]) // 2] +inputs = processor( + audio=[sample_1, sample_2], + sampling_rate=sample["sampling_rate"], + text=["80s blues track with groovy saxophone", "90s rock song with loud guitars and heavy drums"], + padding=True, + return_tensors="pt", + ) +audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256) +post-process to remove padding from the batched audio +audio_values = processor.batch_decode(audio_values, padding_mask=inputs.padding_mask) + +Generation Configuration +The default parameters that control the generation process, such as sampling, guidance scale and number of generated +tokens, can be found in the model's generation config, and updated as desired: +thon + +from transformers import MusicgenForConditionalGeneration +model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small") +inspect the default generation config +model.generation_config +increase the guidance scale to 4.0 +model.generation_config.guidance_scale = 4.0 +decrease the max length to 256 tokens +model.generation_config.max_length = 256 + +Note that any arguments passed to the generate method will supersede those in the generation config, so setting +do_sample=False in the call to generate will supersede the setting of model.generation_config.do_sample in the +generation config. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_34.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..08f24bd954a396ec0df49ee00dc6220db3722f76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_34.txt @@ -0,0 +1,3 @@ +Model Structure +The MusicGen model can be de-composed into three distinct stages: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_35.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b41ea739fa93bd91fbd4dd981f9670343edbb1d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_35.txt @@ -0,0 +1 @@ +Text encoder: maps the text inputs to a sequence of hidden-state representations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_36.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fe711df2267c283e1fe5ea2bf1e2d8f1b7685b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_36.txt @@ -0,0 +1,2 @@ +The pre-trained MusicGen models use a frozen text encoder from either T5 or Flan-T5 +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_37.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6fc173d55be25b9f5f0cf8501ffef27fe976729 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_37.txt @@ -0,0 +1,2 @@ +MusicGen decoder: a language model (LM) that auto-regressively generates audio tokens (or codes) conditional on the encoder hidden-state representations +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_38.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ed498882e9941a21e573c0bef15d0693e730017 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_38.txt @@ -0,0 +1,4 @@ +Audio encoder/decoder: used to encode an audio prompt to use as prompt tokens, and recover the audio waveform from the audio tokens predicted by the decoder +Thus, the MusicGen model can either be used as a standalone decoder model, corresponding to the class [MusicgenForCausalLM], +or as a composite model that includes the text encoder and audio encoder/decoder, corresponding to the class +[MusicgenForConditionalGeneration]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_39.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..32ae1f2e43de5ca9fae902867a439255d62e8742 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_39.txt @@ -0,0 +1,13 @@ +If only the decoder needs to be loaded from the pre-trained checkpoint, it can be loaded by first +specifying the correct config, or be accessed through the .decoder attribute of the composite model: +thon + +from transformers import AutoConfig, MusicgenForCausalLM, MusicgenForConditionalGeneration +Option 1: get decoder config and pass to .from_pretrained +decoder_config = AutoConfig.from_pretrained("facebook/musicgen-small").decoder +decoder = MusicgenForCausalLM.from_pretrained("facebook/musicgen-small", **decoder_config) +Option 2: load the entire composite model, but only return the decoder +decoder = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small").decoder + +Since the text encoder and audio encoder/decoder models are frozen during training, the MusicGen decoder [MusicgenForCausalLM] +can be trained standalone on a dataset of encoder hidden-states and audio codes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_40.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f126cf0ff10818b8be8ee5da6af3af6e07681c2e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_40.txt @@ -0,0 +1,3 @@ +For inference, the trained decoder can +be combined with the frozen text encoder and audio encoder/decoders to recover the composite [MusicgenForConditionalGeneration] +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_41.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..70ca52bb733886cd4d72e393be7d65760f483ff8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_41.txt @@ -0,0 +1,2 @@ +Tips: +* MusicGen is trained on the 32kHz checkpoint of Encodec. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_42.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ff3089a7c4c5e8e3b987d05bb2fc544883c7dc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_42.txt @@ -0,0 +1 @@ +You should ensure you use a compatible version of the Encodec model. 
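One way to check that compatibility (a sketch, not taken from the original documentation) is to inspect the audio encoder sub-config of the composite model, using the same config attributes as the sampling-rate lookup shown earlier on this page:
python

from transformers import MusicgenForConditionalGeneration
model = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
# the composite checkpoint bundles its own frozen audio encoder; its sub-config
# shows which codec and sampling rate the decoder was trained against
print(model.config.audio_encoder.model_type)     # expected to be "encodec"
print(model.config.audio_encoder.sampling_rate)  # expected to be 32000 for MusicGen checkpoints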
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_musicgen/chunk_43.txt b/chunked/content_aware_chunking/model_doc_musicgen/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c8957156b94d4a525f56e85ff5e26f79a27aa15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_musicgen/chunk_43.txt @@ -0,0 +1,16 @@ +* Sampling mode tends to deliver better results than greedy - you can toggle sampling with the variable do_sample in the call to [MusicgenForConditionalGeneration.generate] +MusicgenDecoderConfig +[[autodoc]] MusicgenDecoderConfig +MusicgenConfig +[[autodoc]] MusicgenConfig +MusicgenProcessor +[[autodoc]] MusicgenProcessor +MusicgenModel +[[autodoc]] MusicgenModel + - forward +MusicgenForCausalLM +[[autodoc]] MusicgenForCausalLM + - forward +MusicgenForConditionalGeneration +[[autodoc]] MusicgenForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_10.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..90445a40081d884881a49fdaad7bf7ac394cca49 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_10.txt @@ -0,0 +1 @@ +If you want to use a model with task-specific prompts, such as summarization, you can load it through MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp-summarization'). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_11.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0283c6885f223cabb110d796ddc17d08e39fd84 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_11.txt @@ -0,0 +1 @@ +Our model supports lightweight prompt tuning following Prefix-tuning with the method set_lightweight_tuning(). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_12.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba12062b1e8af35732dc1ca8f302c8f78c2b08a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_12.txt @@ -0,0 +1,2 @@ +Usage examples +For summarization, the following is an example of using MVP and MVP with summarization-specific prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_13.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1127333399912cb542824eb3d79777e7017b5ef1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_13.txt @@ -0,0 +1,8 @@ +python + +from transformers import MvpTokenizer, MvpForConditionalGeneration +tokenizer = MvpTokenizer.from_pretrained("RUCAIBox/mvp") +model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp") +model_with_prompt = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp-summarization") +inputs = tokenizer( + "Summarize: You may want to stick it to your boss and leave your job, but don't do it if these are your reasons.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_14.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d6ae4e935bd1814c965b965b4588538d14b8425 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_14.txt @@ -0,0 +1,11 @@ +", + return_tensors="pt", + ) +generated_ids = model.generate(**inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +["Why You Shouldn't Quit Your Job"] +generated_ids = model_with_prompt.generate(**inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +["Don't do it if these are your reasons"] + +For data-to-text generation, the following is an example of using MVP and the multi-task pre-trained variants. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_15.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2372840b6691778ed901c9fa8e744a50a1315018 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_15.txt @@ -0,0 +1,16 @@ +python + +from transformers import MvpTokenizerFast, MvpForConditionalGeneration +tokenizer = MvpTokenizerFast.from_pretrained("RUCAIBox/mvp") +model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp") +model_with_mtl = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mtl-data-to-text") +inputs = tokenizer( + "Describe the following data: Iron Man | instance of | Superhero [SEP] Stan Lee | creator | Iron Man", + return_tensors="pt", + ) +generated_ids = model.generate(**inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +['Stan Lee created the character of Iron Man, a fictional superhero appearing in American comic'] +generated_ids = model_with_mtl.generate(**inputs) +tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +['Iron Man is a fictional superhero appearing in American comic books published by Marvel Comics.'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_16.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..2039f422a96ad06fdc50a2795487ebf74f1ea919 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_16.txt @@ -0,0 +1 @@ +For lightweight tuning, i.e., fixing the model and only tuning prompts, you can load MVP with randomly initialized prompts or with task-specific prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_17.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..787d69a8e8889c63add054093f69fb8ea1f9c41d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_17.txt @@ -0,0 +1 @@ +Our code also supports Prefix-tuning with BART following the original paper.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_18.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbd038ad1917a9cf5bebaf0ea55c1cd3a9a9d3f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_18.txt @@ -0,0 +1,49 @@ +thon + +from transformers import MvpForConditionalGeneration +model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mvp", use_prompt=True) +the number of trainable parameters (full tuning) +sum(p.numel() for p in model.parameters() if p.requires_grad) +468116832 +lightweight tuning with randomly initialized prompts +model.set_lightweight_tuning() +the number of trainable parameters (lightweight tuning) +sum(p.numel() for p in model.parameters() if p.requires_grad) +61823328 +lightweight tuning with task-specific prompts +model = MvpForConditionalGeneration.from_pretrained("RUCAIBox/mtl-data-to-text") +model.set_lightweight_tuning() +original lightweight Prefix-tuning +model = MvpForConditionalGeneration.from_pretrained("facebook/bart-large", use_prompt=True) +model.set_lightweight_tuning() + +Resources + +Text classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Translation task guide +Summarization task guide + +MvpConfig +[[autodoc]] MvpConfig +MvpTokenizer +[[autodoc]] MvpTokenizer +MvpTokenizerFast +[[autodoc]] MvpTokenizerFast +MvpModel +[[autodoc]] MvpModel + - forward +MvpForConditionalGeneration +[[autodoc]] MvpForConditionalGeneration + - forward +MvpForSequenceClassification +[[autodoc]] MvpForSequenceClassification + - forward +MvpForQuestionAnswering +[[autodoc]] MvpForQuestionAnswering + - forward +MvpForCausalLM +[[autodoc]] MvpForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_5.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f49764d7c1a399e75091c43fab7ebc8f7d461d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_5.txt @@ -0,0 +1 @@ +Our model can also be adapted to natural language understanding tasks such as sequence classification and (extractive) question answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_6.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..56f07048fd0b2b651b371f0d1ea61d5ddd2613b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by Tianyi Tang. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_7.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..53ee0b8a89aa71d1b27812b08100106583701f1c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_7.txt @@ -0,0 +1 @@ +The detailed information and instructions can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_8.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..acbede2876b7b5474ce8698f4331cd039c273b6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +We have released a series of models here, including MVP, MVP with task-specific prompts, and multi-task pre-trained variants. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_mvp/chunk_9.txt b/chunked/content_aware_chunking/model_doc_mvp/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7353281324157083cc0ad76021cc051e9f1ac3c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_mvp/chunk_9.txt @@ -0,0 +1 @@ +If you want to use a model without prompts (standard Transformer), you can load it through MvpForConditionalGeneration.from_pretrained('RUCAIBox/mvp'). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_10.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..be967ace8a50cc07c821db5cea592e0657a9123a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by Ali Hassani. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_11.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_12.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d8b684b2a9f0fd1a02989936f4098eea6b7a0f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +One can use the [AutoImageProcessor] API to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_13.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..3956d1aa0f71c26a5e3904fc58909608330d2aeb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_13.txt @@ -0,0 +1 @@ +NAT can be used as a backbone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_14.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8436ad15d61ede6b4cbad9fcec731c0a9bef3219 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_14.txt @@ -0,0 +1,2 @@ +When output_hidden_states = True, +it will output both hidden_states and reshaped_hidden_states. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_15.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ddeeacc61d738c221ac91624c6ee3baba7b57f4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_15.txt @@ -0,0 +1,2 @@ +The reshaped_hidden_states have a shape of (batch, num_channels, height, width) rather than +(batch_size, height, width, num_channels). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_16.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b87e6625aeac01f7920aee6cc554b35447ff590 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_16.txt @@ -0,0 +1,2 @@ +Notes: +- NAT depends on NATTEN's implementation of Neighborhood Attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_17.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..fde565929fe10c30c0954e324fc1f817bf3d26e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_17.txt @@ -0,0 +1,2 @@ +You can install it with pre-built wheels for Linux by referring to shi-labs.com/natten, +or build on your system by running pip install natten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_18.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c4d10e3c11ce2138f41f46f874e53cda6493362 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_18.txt @@ -0,0 +1 @@ +Note that the latter will likely take time to compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_19.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0e4261edf35c392458979a925aae606347d8dbc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_19.txt @@ -0,0 +1 @@ +NATTEN does not support Windows devices yet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_20.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c51930e990013fd56cc6990deb6de1de4090e99b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_20.txt @@ -0,0 +1 @@ +- Patch size of 4 is only supported at the moment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_21.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..101ee2468c9d248395ffb6dcd6e07c1dc8da4644 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_21.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with NAT. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_22.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c2c708ad66122181115c3c6dd59225e2adc2a89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_22.txt @@ -0,0 +1 @@ +[NatForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_23.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_23.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_24.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_25.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e21fcd29a298b040b6c3a463ec7672e08ba29f09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_25.txt @@ -0,0 +1,8 @@ +NatConfig +[[autodoc]] NatConfig +NatModel +[[autodoc]] NatModel + - forward +NatForImageClassification +[[autodoc]] NatForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_7.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..92168fb4383d08e93771deca4bc105fc87b9cc14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_7.txt @@ -0,0 +1,3 @@ +Experimental results on NAT are competitive; +NAT-Tiny reaches 83.2% top-1 accuracy on ImageNet, 51.4% mAP on MS-COCO and 48.4% mIoU on ADE20K, which is 1.9% +ImageNet accuracy, 1.0% COCO mAP, and 2.6% ADE20K mIoU improvement over a Swin model with similar size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_8.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..dac645e836e4c090766eb11b2364c6dd3836815b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_8.txt @@ -0,0 +1,3 @@ +* + + Neighborhood Attention compared to other attention patterns. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nat/chunk_9.txt b/chunked/content_aware_chunking/model_doc_nat/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nat/chunk_9.txt @@ -0,0 +1 @@ +Taken from the original paper. 
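Putting the NAT usage tips above together, here is a small sketch of preparing an image with [AutoImageProcessor] and reading the channels-first reshaped_hidden_states; the checkpoint name shi-labs/nat-mini-in1k-224 and the sample image URL are assumptions, and NATTEN must be installed as noted above:
python

import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, NatModel
url = "http://images.cocodataset.org/val2017/000000039769.jpg"  # any RGB image works
image = Image.open(requests.get(url, stream=True).raw)
processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
model = NatModel.from_pretrained("shi-labs/nat-mini-in1k-224")
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True)
# hidden_states are (batch, height, width, num_channels);
# reshaped_hidden_states swap to channels-first, as described in the tips above
print(outputs.reshaped_hidden_states[-1].shape)  # (batch, num_channels, height, width)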
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nezha/chunk_5.txt b/chunked/content_aware_chunking/model_doc_nezha/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..45bcfa61c37c9af8a1ec451ff639a1bfe8929dda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nezha/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by sijunhe. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nezha/chunk_6.txt b/chunked/content_aware_chunking/model_doc_nezha/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nezha/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nezha/chunk_7.txt b/chunked/content_aware_chunking/model_doc_nezha/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5cf5751d2c1018995a24e96180afc2eea15e10e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nezha/chunk_7.txt @@ -0,0 +1,34 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +NezhaConfig +[[autodoc]] NezhaConfig +NezhaModel +[[autodoc]] NezhaModel + - forward +NezhaForPreTraining +[[autodoc]] NezhaForPreTraining + - forward +NezhaForMaskedLM +[[autodoc]] NezhaForMaskedLM + - forward +NezhaForNextSentencePrediction +[[autodoc]] NezhaForNextSentencePrediction + - forward +NezhaForSequenceClassification +[[autodoc]] NezhaForSequenceClassification + - forward +NezhaForMultipleChoice +[[autodoc]] NezhaForMultipleChoice + - forward +NezhaForTokenClassification +[[autodoc]] NezhaForTokenClassification + - forward +NezhaForQuestionAnswering +[[autodoc]] NezhaForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_10.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab9617ef2517f302473dd89cd52cc9b95ae1cba6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by Arthur Zucker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_11.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_12.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d900d5faaf63098bf52e8781a64461c39746917 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +M2M100ForConditionalGeneration is the base model for both NLLB and NLLB MoE +The NLLB-MoE is very similar to the NLLB model, but it's feed forward layer is based on the implementation of SwitchTransformers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_13.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..59f1d18913d5be4d688a9128182a2294bca40f76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_13.txt @@ -0,0 +1 @@ +The tokenizer is the same as the NLLB models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_14.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5e9327fefebcb1da06f5c22baea757e94be0ec3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_14.txt @@ -0,0 +1,2 @@ +Implementation differences with SwitchTransformers +The biggest difference is the way the tokens are routed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_15.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..490dfaec01147a2decbc5c9f0ba8c395f1bc80c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_15.txt @@ -0,0 +1,2 @@ +NLLB-MoE uses a top-2-gate which means that for each input, only the top two experts are selected based on the +highest predicted probabilities from the gating network, and the remaining experts are ignored. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_16.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f73912c1701e739f460d4b0f4ad882f800d1ea2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_16.txt @@ -0,0 +1,2 @@ +In SwitchTransformers, only the top-1 probabilities are computed, +which means that tokens have less probability of being forwarded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_17.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e34c803633df7db9a359c91d87556db9ed5575a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_17.txt @@ -0,0 +1,2 @@ +Moreover, if a token is not routed to any expert, SwitchTransformers still adds its unmodified hidden +states (kind of like a residual connection) while they are masked in NLLB's top-2 routing mechanism. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_18.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..035575183dd10f98e54e588787603e4e0fd5c468 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_18.txt @@ -0,0 +1,2 @@ +Generating with NLLB-MoE +The available checkpoints require around 350GB of storage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_19.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ee8506e8a8cf21ed2a0815aecef7268ba51b8e5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_19.txt @@ -0,0 +1 @@ +Make sure to use accelerate if you do not have enough RAM on your machine. 
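To illustrate the top-2 routing described above, here is a small self-contained sketch of the idea; it is illustrative only and does not reproduce the library's NllbMoeTop2Router (the function and variable names are made up):
python

import torch

def top2_route(gate_logits):
    # gate_logits: (num_tokens, num_experts) scores from the gating network
    probs = torch.softmax(gate_logits, dim=-1)
    top2_probs, top2_experts = probs.topk(2, dim=-1)  # keep only the two best experts per token
    routing_weights = torch.zeros_like(probs).scatter_(-1, top2_experts, top2_probs)
    return routing_weights  # all remaining experts get weight 0 for that token

gate_logits = torch.randn(4, 8)  # 4 tokens, 8 experts
print(top2_route(gate_logits))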
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_20.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f791260c6feb2c11dd2a155e7f86f227c3b938dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_20.txt @@ -0,0 +1 @@ +While generating the target text set the forced_bos_token_id to the target language id. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_21.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde1607eaf1fd6d0298039e5688178e1f38a3027 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_21.txt @@ -0,0 +1,2 @@ +The following +example shows how to translate English to French using the facebook/nllb-200-distilled-600M model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_22.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2479d98a531ad2225eda0ff9472267b01ba7678 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_22.txt @@ -0,0 +1 @@ +Note that we're using the BCP-47 code for French fra_Latn. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_23.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..f357b1c764a30fb08a39412d4f37bd8e1f110f92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_23.txt @@ -0,0 +1,2 @@ +See here +for the list of all BCP-47 in the Flores 200 dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_24.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1382785e084ce341799f242fa7653d29458db2e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_24.txt @@ -0,0 +1,6 @@ +thon + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b") +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-moe-54b") +article = "Previously, Ring's CEO, Jamie Siminoff, remarked the company started when his doorbell wasn't audible from his shop in his garage." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_25.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..38dcd090d51e8a7837ff1acbac27ceb3290c20a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_25.txt @@ -0,0 +1,6 @@ +inputs = tokenizer(article, return_tensors="pt") +translated_tokens = model.generate( + **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=50 + ) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +"Auparavant, le PDG de Ring, Jamie Siminoff, a fait remarquer que la société avait commencé lorsque sa sonnette n'était pas audible depuis son magasin dans son garage." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_26.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..abdc72511d35b7703ff65ede8f1acc5e506bc6b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_26.txt @@ -0,0 +1,2 @@ +Generating from any other language than English +English (eng_Latn) is set as the default language from which to translate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_27.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d89c813bc4da97b0e264c4a193f800c601ce726 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_27.txt @@ -0,0 +1,2 @@ +In order to specify that you'd like to translate from a different language, +you should specify the BCP-47 code in the src_lang keyword argument of the tokenizer initialization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_28.txt b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..de2802ede0e12084a5b352c8d144cde01baaaaf4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb-moe/chunk_28.txt @@ -0,0 +1,33 @@ +See the example below for a translation from Romanian to German: +python + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-moe-54b", src_lang="ron_Latn") +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-moe-54b") +article = "Şeful ONU spune că nu există o soluţie militară în Siria" +inputs = tokenizer(article, return_tensors="pt") +translated_tokens = model.generate( + **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30 + ) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] + +Resources + +Translation task guide +Summarization task guide + +NllbMoeConfig +[[autodoc]] NllbMoeConfig +NllbMoeTop2Router +[[autodoc]] NllbMoeTop2Router + - route_tokens + - forward +NllbMoeSparseMLP +[[autodoc]] NllbMoeSparseMLP + - forward +NllbMoeModel +[[autodoc]] NllbMoeModel + - forward +NllbMoeForConditionalGeneration +[[autodoc]] NllbMoeForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_10.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c773f1f348b25f458dbd50188b9281a6938536d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_10.txt @@ -0,0 +1 @@ +However, such efforts have coalesced around a small subset of languages, leaving behind the vast majority of mostly low-resource languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_11.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdc07c4526f6d27a4b404936256ba90e6dad1e86 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_11.txt @@ -0,0 +1,2 @@ +What does it take to break the +200 language barrier while ensuring safe, high quality results, all while keeping ethical considerations in mind?
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_12.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f6d082fce10f922144beff250f586c0ca5b0722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_12.txt @@ -0,0 +1,2 @@ +In No Language Left Behind, we took on this challenge by +first contextualizing the need for low-resource language translation support through exploratory interviews with native speakers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_13.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b05fe0b5cb9ad9498747cb279b239aa1dfd1764 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_13.txt @@ -0,0 +1,2 @@ +Then, we created datasets and models aimed +at narrowing the performance gap between low and high-resource languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_14.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0a647d8a71d9da0ddc330c48938095bb9c6972f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_14.txt @@ -0,0 +1,2 @@ +More specifically, we developed a conditional compute model based on Sparsely Gated Mixture of +Experts that is trained on data obtained with novel and effective data mining techniques tailored for low-resource languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_15.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed7b00c0b17fdc1f96c8a0624905fa9ce246ae9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_15.txt @@ -0,0 +1,2 @@ +We propose multiple architectural and training +improvements to counteract overfitting while training on thousands of tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_16.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a204a9e08167294f8a6c05c079d852b341cdfda2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_16.txt @@ -0,0 +1,2 @@ +Critically, we evaluated the performance of over 40,000 different translation directions using +a human-translated benchmark, Flores-200, and combined human evaluation with a novel toxicity benchmark covering all languages in Flores-200 to assess translation safety. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_17.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..cab406c1192fa2652164ea97a5bf132a6c91fd0c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_17.txt @@ -0,0 +1 @@ +Our model achieves an improvement of 44% BLEU relative to the previous state-of-the-art, laying important groundwork towards realizing a universal translation system. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_18.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..33fe22d9e4050569638ca05e371dd8703d6eca6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_18.txt @@ -0,0 +1 @@ +This implementation contains the dense models available on release. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_19.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..35680a2d162e701dc3a5bb2237c21e26772bf1ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_19.txt @@ -0,0 +1 @@ +The sparse model NLLB-MoE (Mixture of Expert) is now available! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_20.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..68b94215d2b11111a344abe091c7bbf681c9c59d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_20.txt @@ -0,0 +1,2 @@ +More details here +This model was contributed by Lysandre. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_21.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..2134d6fd2d03c54920ac0f0ea65d58fe383cc87e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_21.txt @@ -0,0 +1 @@ +The authors' code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_22.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..77823a658c4639861750cf2553ca8a48e2e58b37 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_22.txt @@ -0,0 +1,2 @@ +Generating with NLLB +While generating the target text set the forced_bos_token_id to the target language id. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_23.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde1607eaf1fd6d0298039e5688178e1f38a3027 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_23.txt @@ -0,0 +1,2 @@ +The following +example shows how to translate English to French using the facebook/nllb-200-distilled-600M model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_24.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2479d98a531ad2225eda0ff9472267b01ba7678 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_24.txt @@ -0,0 +1 @@ +Note that we're using the BCP-47 code for French fra_Latn. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_25.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f357b1c764a30fb08a39412d4f37bd8e1f110f92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_25.txt @@ -0,0 +1,2 @@ +See here +for the list of all BCP-47 in the Flores 200 dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_26.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea2eee756e34969b07e59a686a5f4af4190264c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_26.txt @@ -0,0 +1,15 @@ +python + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M") +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M") +article = "UN Chief says there is no military solution in Syria" +inputs = tokenizer(article, return_tensors="pt") +translated_tokens = model.generate( + **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=30 + ) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie + +Generating from any other language than English +English (eng_Latn) is set as the default language from which to translate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_27.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d89c813bc4da97b0e264c4a193f800c601ce726 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_27.txt @@ -0,0 +1,2 @@ +In order to specify that you'd like to translate from a different language, +you should specify the BCP-47 code in the src_lang keyword argument of the tokenizer initialization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nllb/chunk_28.txt b/chunked/content_aware_chunking/model_doc_nllb/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..359dcd156552746bb642c31dd8650b1ad74e2ac1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nllb/chunk_28.txt @@ -0,0 +1,25 @@ +See the example below for a translation from Romanian to German: + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained( + "facebook/nllb-200-distilled-600M", token=True, src_lang="ron_Latn" + ) +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M", token=True) +article = "Şeful ONU spune că nu există o soluţie militară în Siria" +inputs = tokenizer(article, return_tensors="pt") +translated_tokens = model.generate( + **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30 + ) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +UN-Chef sagt, es gibt keine militärische Lösung in Syrien + +Resources + +Translation task guide +Summarization task guide + +NllbTokenizer
[[autodoc]] NllbTokenizer + - build_inputs_with_special_tokens +NllbTokenizerFast +[[autodoc]] NllbTokenizerFast \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_10.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_10.txt @@ -0,0 +1,2 @@ +The original code can be found +here.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_11.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4abfb526a843e2435d4377ac5769cad6618980f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_11.txt @@ -0,0 +1,5 @@ +Usage tips + +The quickest way to get started with Nougat is by checking the tutorial + notebooks, which show how to use the model + at inference time as well as fine-tuning on custom data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_12.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b46d64e549b2d00e20d4fe7e032f00bb7292f71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_12.txt @@ -0,0 +1 @@ +Nougat is always used within the VisionEncoderDecoder framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_13.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..183ccaf38824dc4af1266f6939cc2f1596e5aafd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_13.txt @@ -0,0 +1 @@ +The model is identical to Donut in terms of architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_14.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6497d66e5a74afdc59d1f5e95273791c1106756 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_14.txt @@ -0,0 +1,3 @@ +Inference +Nougat's [VisionEncoderDecoder] model accepts images as input and makes use of +[~generation.GenerationMixin.generate] to autoregressively generate text given the input image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_15.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..89303782a1bce5800653009e8d4cc9d1c70731df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_15.txt @@ -0,0 +1,2 @@ +The [NougatImageProcessor] class is responsible for preprocessing the input image and +[NougatTokenizerFast] decodes the generated target tokens to the target string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_16.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c737e454a79da4fc4a1af2eb9d9b5d1dba45e33 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_16.txt @@ -0,0 +1,3 @@ +The +[NougatProcessor] wraps [NougatImageProcessor] and [NougatTokenizerFast] classes +into a single instance to both extract the input features and decode the predicted token ids. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_17.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..03dcbf8ec2cf345f333e7cc3382a2119be4b9873 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_17.txt @@ -0,0 +1,30 @@ +Step-by-step PDF transcription + +from huggingface_hub import hf_hub_download +import re +from PIL import Image +from transformers import NougatProcessor, VisionEncoderDecoderModel +from datasets import load_dataset +import torch +processor = NougatProcessor.from_pretrained("facebook/nougat-base") +model = VisionEncoderDecoderModel.from_pretrained("facebook/nougat-base") +device = "cuda" if torch.cuda.is_available() else "cpu" +model.to(device) # doctest: +IGNORE_RESULT +prepare PDF image for the model +filepath = hf_hub_download(repo_id="hf-internal-testing/fixtures_docvqa", filename="nougat_paper.png", repo_type="dataset") +image = Image.open(filepath) +pixel_values = processor(image, return_tensors="pt").pixel_values +generate transcription (here we only generate 30 tokens) +outputs = model.generate( + pixel_values.to(device), + min_length=1, + max_new_tokens=30, + bad_words_ids=[[processor.tokenizer.unk_token_id]], + ) +sequence = processor.batch_decode(outputs, skip_special_tokens=True)[0] +sequence = processor.post_process_generation(sequence, fix_markdown=False) +note: we're using repr here such for the sake of printing the \n characters, feel free to just print the sequence +print(repr(sequence)) +'\n\n# Nougat: Neural Optical Understanding for Academic Documents\n\n Lukas Blecher\n\nCorrespondence to: lblecher@' + +See the model hub to look for Nougat checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_18.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..183ccaf38824dc4af1266f6939cc2f1596e5aafd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_18.txt @@ -0,0 +1 @@ +The model is identical to Donut in terms of architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_19.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d93fe289a7dc0b406ddfdd549425715ec5e93c6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_19.txt @@ -0,0 +1,13 @@ +NougatImageProcessor +[[autodoc]] NougatImageProcessor + - preprocess +NougatTokenizerFast +[[autodoc]] NougatTokenizerFast +NougatProcessor +[[autodoc]] NougatProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode + - post_process_generation \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_6.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cfebc09ed238a8e113773ec4bbfb94d7bfba3c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_6.txt @@ -0,0 +1 @@ +We release the models and code to accelerate future work on scientific text recognition. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_7.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b60cb910bbfb7ac30b8c07ea7eb76c11947e048 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_7.txt @@ -0,0 +1 @@ +Nougat high-level overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_8.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nougat/chunk_9.txt b/chunked/content_aware_chunking/model_doc_nougat/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nougat/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe063d0f82bdd9b95cdc2cee77e4423709b2ab51 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by novice03. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..726bad7e8fa88a3bfc770cbcd93e14518c033c84 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_12.txt @@ -0,0 +1,28 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +NystromformerConfig +[[autodoc]] NystromformerConfig +NystromformerModel +[[autodoc]] NystromformerModel + - forward +NystromformerForMaskedLM +[[autodoc]] NystromformerForMaskedLM + - forward +NystromformerForSequenceClassification +[[autodoc]] NystromformerForSequenceClassification + - forward +NystromformerForMultipleChoice +[[autodoc]] NystromformerForMultipleChoice + - forward +NystromformerForTokenClassification +[[autodoc]] NystromformerForTokenClassification + - forward +NystromformerForQuestionAnswering +[[autodoc]] NystromformerForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_5.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..688a56ccb16a9cbcb7d9526ee851f42a32771d5f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_5.txt @@ -0,0 +1,2 @@ +Our idea is based on adapting the Nyström method to approximate standard self-attention +with O(n) complexity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aa37013eb06c933f7a752c0433f50313af6c7ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_6.txt @@ -0,0 +1,2 @@ +The scalability of Nyströmformer enables application to longer sequences with thousands of +tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4850273803d5433f76b63b855408782d236de7c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_7.txt @@ -0,0 +1,3 @@ +We perform evaluations on multiple downstream tasks on the GLUE benchmark and IMDB reviews with standard +sequence length, and find that our Nyströmformer performs comparably, or in a few cases, even slightly better, than +standard self-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8c8d18654d057baad05d6025da13efca4951c97 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_8.txt @@ -0,0 +1,2 @@ +On longer sequence tasks in the Long Range Arena (LRA) benchmark, Nyströmformer performs +favorably relative to other efficient self-attention methods. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_nystromformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..74940f6035e9a5e3959ffb1fef09aa0212ee6e2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_nystromformer/chunk_9.txt @@ -0,0 +1 @@ +Our code is available at this https URL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d675b682d3f0e55cf008b2255397e4fbd55b4d81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_10.txt @@ -0,0 +1 @@ +Thirdly, we propose using a query-text contrastive loss during training to establish better inter-task and inter-class distinctions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5133b77c4fd47392b8e9dc72de846bc786a5150 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_11.txt @@ -0,0 +1 @@ +Notably, our single OneFormer model outperforms specialized Mask2Former models across all three segmentation tasks on ADE20k, CityScapes, and COCO, despite the latter being trained on each of the three tasks individually with three times the resources. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e637e1cce6bd607e8a52cb9b0c11022e7c210c27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_12.txt @@ -0,0 +1 @@ +With new ConvNeXt and DiNAT backbones, we observe even more performance improvement. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..17370b1bfea46d904503dbce72cfe2b9f964e1dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_13.txt @@ -0,0 +1 @@ +We believe OneFormer is a significant step towards making image segmentation more universal and accessible. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..504999b85ddb9413ae920637c1d176f24a6f8bc1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_14.txt @@ -0,0 +1 @@ +The figure below illustrates the architecture of OneFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_15.txt @@ -0,0 +1 @@ +Taken from the original paper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0105e8c56cc39e11d7983edfa4fab3b308b89d9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_16.txt @@ -0,0 +1 @@ +This model was contributed by Jitesh Jain. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_17.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e09fbbc795b38694a3cb319532465f7e1f62472f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_18.txt @@ -0,0 +1,3 @@ +Usage tips + +OneFormer requires two inputs during inference: image and task token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..513c6173015a8070c1cda74d9177b494204f578a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_19.txt @@ -0,0 +1 @@ +During training, OneFormer only uses panoptic annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b342a0dd8c8405a2ddada7140237f019b3ec4c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_20.txt @@ -0,0 +1,2 @@ +If you want to train the model in a distributed environment across multiple nodes, then one should update the + get_num_masks function inside in the OneFormerLoss class of modeling_oneformer.py. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..3882d6f8d7c092f15f5ec229efefb81db3f84854 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_21.txt @@ -0,0 +1,2 @@ +When training on multiple nodes, this should be + set to the average number of target masks across all nodes, as can be seen in the original implementation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..88c9537fdaf596b375b62b7d2b4f81dbdac59309 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_22.txt @@ -0,0 +1 @@ +One can use [OneFormerProcessor] to prepare input images and task inputs for the model and optional targets for the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..5589c13e6d20b0464ba5bed4aca26e256d23dac4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_23.txt @@ -0,0 +1 @@ +[OneformerProcessor] wraps [OneFormerImageProcessor] and [CLIPTokenizer] into a single instance to both prepare the images and encode the task inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..12404d8912732d3f591b700f8b24011f41fb0c1f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_24.txt @@ -0,0 +1 @@ +To get the final segmentation, depending on the task, you can call [~OneFormerProcessor.post_process_semantic_segmentation] or [~OneFormerImageProcessor.post_process_instance_segmentation] or [~OneFormerImageProcessor.post_process_panoptic_segmentation]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f3ac3183cab34a2231ed24856689ee72bd551ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_25.txt @@ -0,0 +1 @@ +All three tasks can be solved using [OneFormerForUniversalSegmentation] output, panoptic segmentation accepts an optional label_ids_to_fuse argument to fuse instances of the target object/s (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_26.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..8748f43c1dfe000b767514d7874060aa06935397 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_26.txt @@ -0,0 +1 @@ +sky) together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_27.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..a41ba240118f80a7f30e9220fc20e15e523b7472 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_27.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OneFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_28.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..af0d8cf1de164d51e267d8cf90e07a2ead0036ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_28.txt @@ -0,0 +1 @@ +Demo notebooks regarding inference + fine-tuning on custom data can be found here. 
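Since the usage tips above describe the [OneFormerProcessor] / [OneFormerForUniversalSegmentation] workflow only in prose, a short sketch may help. The checkpoint name and the "semantic" task token below are illustrative assumptions; the calls follow the processor and post-processing methods named in the tips.

```python
import requests
import torch
from PIL import Image
from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation

# illustrative checkpoint; other OneFormer checkpoints on the Hub should work the same way
checkpoint = "shi-labs/oneformer_ade20k_swin_tiny"
processor = OneFormerProcessor.from_pretrained(checkpoint)
model = OneFormerForUniversalSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# the task token selects the task: "semantic", "instance" or "panoptic"
inputs = processor(images=image, task_inputs=["semantic"], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# post-process into a (height, width) map of class ids for the semantic task
semantic_map = processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
print(semantic_map.shape)
```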
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_29.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b997fc410fec140a8b1f0a0f029593247bf8b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_29.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_30.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_30.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_oneformer/chunk_31.txt b/chunked/content_aware_chunking/model_doc_oneformer/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff1ff30619ff3cd0827f54f8412e8d0befb08abc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_oneformer/chunk_31.txt @@ -0,0 +1,20 @@ +OneFormer specific outputs +[[autodoc]] models.oneformer.modeling_oneformer.OneFormerModelOutput +[[autodoc]] models.oneformer.modeling_oneformer.OneFormerForUniversalSegmentationOutput +OneFormerConfig +[[autodoc]] OneFormerConfig +OneFormerImageProcessor +[[autodoc]] OneFormerImageProcessor + - preprocess + - encode_inputs + - post_process_semantic_segmentation + - post_process_instance_segmentation + - post_process_panoptic_segmentation +OneFormerProcessor +[[autodoc]] OneFormerProcessor +OneFormerModel +[[autodoc]] OneFormerModel + - forward +OneFormerForUniversalSegmentation +[[autodoc]] OneFormerForUniversalSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_2.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fe499e26bce24044d572a19f123ca3f1495e84d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_2.txt @@ -0,0 +1 @@ +You can do so by running the following command: pip install -U transformers==4.31.0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_3.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b93e25ac1536d9be6382ac7d39d7b976fe9b214 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_3.txt @@ -0,0 +1 @@ +This model differs from the OpenLLaMA models on the Hugging Face Hub, which primarily use the LLaMA architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_4.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..59ee080fb851b3192d12713bb1995f7e3c765703 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_4.txt @@ -0,0 +1,2 @@ +Overview +The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_5.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..e48a35086f5635251f139d4cef4e42b9442545e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_5.txt @@ -0,0 +1 @@ +The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_6.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..e140ea4d38153e5d1237469ce405989924c7b122 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_6.txt @@ -0,0 +1 @@ +And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_7.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6887216adcf23691b7dc878841b352060f45c68 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by s-JoL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_8.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cabaeaf3c9d3542b94214b585c31e2465f75b2bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_8.txt @@ -0,0 +1 @@ +The original code was released on GitHub by s-JoL, but is now removed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_open-llama/chunk_9.txt b/chunked/content_aware_chunking/model_doc_open-llama/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b8181c4a14ecdee97cc0e7e4aa73c45e5e6eb8d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_open-llama/chunk_9.txt @@ -0,0 +1,11 @@ +OpenLlamaConfig +[[autodoc]] OpenLlamaConfig +OpenLlamaModel +[[autodoc]] OpenLlamaModel + - forward +OpenLlamaForCausalLM +[[autodoc]] OpenLlamaForCausalLM + - forward +OpenLlamaForSequenceClassification +[[autodoc]] OpenLlamaForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f5503f4ac19afee0dd16d9eb5c6f1c87ca2aae4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..944ac34f056927a6ed3634e66b0a9041c8c40c89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_13.txt @@ -0,0 +1,2 @@ +GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next + token in a sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..387e3dd4d3e56b9c870479538d486179e85c9d33 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_14.txt @@ -0,0 +1,2 @@ +Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be + observed in the run_generation.py example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc6891d3b1c8914bff94a0c87aa29cb0e4698665 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_15.txt @@ -0,0 +1,8 @@ +Note: +If you want to reproduce the original tokenization process of the OpenAI GPT paper, you will need to install ftfy +and SpaCy: + +pip install spacy ftfy==4.4.3 +python -m spacy download en +If you don't install ftfy and SpaCy, the [OpenAIGPTTokenizer] will default to tokenize +using BERT's BasicTokenizer followed by Byte-Pair Encoding (which should be fine for most usage, don't worry). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a905312b954997886a4c94de898711d3c2b4682 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_16.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OpenAI GPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_17.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8129a7f72b855596c3122684a6064e847701b29d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_19.txt @@ -0,0 +1 @@ +A blog post on outperforming OpenAI GPT-3 with SetFit for text-classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..2733e42cf09cdf477fcf5b9750559b3273bd11ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_20.txt @@ -0,0 +1,3 @@ +See also: Text classification task guide + +A blog on how to Finetune a non-English GPT-2 Model with Hugging Face. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_21.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9847b4c44426f124c054595b507104228d9c8427 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_21.txt @@ -0,0 +1 @@ +A blog on How to generate text: using different decoding methods for language generation with Transformers with GPT-2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_22.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..142d83994faff59f097875dd54dddf343c79d621 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_22.txt @@ -0,0 +1 @@ +A blog on Training CodeParrot 🦜 from Scratch, a large GPT-2 model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_23.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee895a91f3b0799b622284e171d8a04b6b52d537 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_23.txt @@ -0,0 +1 @@ +A blog on Faster Text Generation with TensorFlow and XLA with GPT-2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_24.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..dca3a023484a29bd8a4a4b021eebd228de4fca99 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_24.txt @@ -0,0 +1 @@ +A blog on How to train a Language Model with Megatron-LM with a GPT-2 model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_25.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6c6d0abc64d9e3489dd231d247aef25d113109d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_25.txt @@ -0,0 +1 @@ +A notebook on how to finetune GPT2 to generate lyrics in the style of your favorite artist. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_26.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d06f544bfcadd33caa11b9818ffcb202eb968689 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_26.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune GPT2 to generate tweets in the style of your favorite Twitter user. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_27.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc36d8d0346391c9d13b6f2d3660fa0d712d712c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_27.txt @@ -0,0 +1,2 @@ +🌎 +Causal language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_28.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b726c8ef964b58b9b1c364fa334a7c220a234553 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_28.txt @@ -0,0 +1 @@ +[OpenAIGPTLMHeadModel] is supported by this causal language modeling example script, text generation example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_29.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad0112ab285a273cb89e5753350eaeece920b0b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_29.txt @@ -0,0 +1 @@ +[TFOpenAIGPTLMHeadModel] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_30.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c9e2aeae8b3472ebc5df48e4038671954ae9c61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_30.txt @@ -0,0 +1,3 @@ +See also: Causal language modeling task guide + +A course material on Byte-Pair Encoding tokenization. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_31.txt b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b787f9ce471497dc84c4b3bcf818788e75822583 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_openai-gpt/chunk_31.txt @@ -0,0 +1,36 @@ +OpenAIGPTConfig +[[autodoc]] OpenAIGPTConfig +OpenAIGPTTokenizer +[[autodoc]] OpenAIGPTTokenizer + - save_vocabulary +OpenAIGPTTokenizerFast +[[autodoc]] OpenAIGPTTokenizerFast +OpenAI specific outputs +[[autodoc]] models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput +[[autodoc]] models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput + +OpenAIGPTModel +[[autodoc]] OpenAIGPTModel + - forward +OpenAIGPTLMHeadModel +[[autodoc]] OpenAIGPTLMHeadModel + - forward +OpenAIGPTDoubleHeadsModel +[[autodoc]] OpenAIGPTDoubleHeadsModel + - forward +OpenAIGPTForSequenceClassification +[[autodoc]] OpenAIGPTForSequenceClassification + - forward + +TFOpenAIGPTModel +[[autodoc]] TFOpenAIGPTModel + - call +TFOpenAIGPTLMHeadModel +[[autodoc]] TFOpenAIGPTLMHeadModel + - call +TFOpenAIGPTDoubleHeadsModel +[[autodoc]] TFOpenAIGPTDoubleHeadsModel + - call +TFOpenAIGPTForSequenceClassification +[[autodoc]] TFOpenAIGPTForSequenceClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b5c008ceec6fbe34d3eb23f0db2898b97b572c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_10.txt @@ -0,0 +1,2 @@ +Tips: +- OPT has the same architecture as [BartDecoder]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1c3809f8ea2ad958b4233049be989c1bdd729a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_11.txt @@ -0,0 +1 @@ +- Contrary to GPT2, OPT adds the EOS token to the beginning of every prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ee59233b36ff42badfea41a66d8efcd79d96b47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_12.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4bb4816a618f57f55fff31e763d1d476f98b05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_13.txt @@ -0,0 +1,2 @@ +If you're +interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it. 
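The tip above about OPT prepending the EOS token to every prompt can be checked directly with the tokenizer; the expected id and token shown in the comment are what the facebook/opt-350m tokenizer should produce and are included as a sanity check rather than a guarantee.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
ids = tokenizer("Hello world")["input_ids"]
# expected: the first id is 2, i.e. the </s> token prepended as BOS
print(ids[0], tokenizer.convert_ids_to_tokens(ids)[0])
```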
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_14.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ade049899cc681f79814b65d5425786cda2e9d1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_15.txt @@ -0,0 +1 @@ +A notebook on fine-tuning OPT with PEFT, bitsandbytes, and Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5bc36205bd21c81de6d2e96725a103534b9c18f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_16.txt @@ -0,0 +1,2 @@ +🌎 +A blog post on decoding strategies with OPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..188d4e3ef86b3c7cb2d8fa95dfb0d6cef3918ced --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_17.txt @@ -0,0 +1 @@ +Causal language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d761f82a4b563f1b8b8ac5d14e67331558b07cfd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_18.txt @@ -0,0 +1 @@ +[OPTForCausalLM] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f189fc3e530779dccac6402f54358cada98dbd7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_19.txt @@ -0,0 +1 @@ +[TFOPTForCausalLM] is supported by this causal language modeling example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..015cbe7228d399d492f65478f51db688dec22732 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_20.txt @@ -0,0 +1 @@ +[FlaxOPTForCausalLM] is supported by this causal language modeling example script. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_21.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd58a09ebf75a72945facbd11a46aa155593071 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_21.txt @@ -0,0 +1,2 @@ +Text classification task guide +[OPTForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_22.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..00e1b19c2bff8399b46581179a7a10c68de0c17e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_22.txt @@ -0,0 +1 @@ +[OPTForQuestionAnswering] is supported by this question answering example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_23.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..29ac3e270dfe2eb0ff4a3150694e0150453d81ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_23.txt @@ -0,0 +1,2 @@ +Question answering chapter + of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_24.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..20eaf74331e21eff1c4fbd4d7cbf1bd864e8e802 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_24.txt @@ -0,0 +1,3 @@ +⚡️ Inference + +A blog post on How 🤗 Accelerate runs very large models thanks to PyTorch with OPT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_25.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c77e35a7531e25e29efcf6da7794f8b974d7908 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_25.txt @@ -0,0 +1,2 @@ +Combining OPT and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_26.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_26.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Also make sure that your hardware is compatible with Flash Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_27.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_27.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of the flash-attn repository. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_28.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_28.txt @@ -0,0 +1 @@ +Also make sure to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_29.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bda3a49ca5a00118f79d30f648777c654176ef0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_29.txt @@ -0,0 +1,12 @@ +`torch.float16`) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import OPTForCausalLM, GPT2Tokenizer +device = "cuda" # the device to load the model onto +model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2") +tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m") +prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the " + "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived " + "there?") \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_30.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c893300fab3029a0495da636857f18e6de8bf2d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_30.txt @@ -0,0 +1,8 @@ +model_inputs = tokenizer([prompt], return_tensors="pt").to(device) +model.to(device) +generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False) +tokenizer.batch_decode(generated_ids)[0] +'A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love' + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the facebook/opt-2.7b checkpoint and the Flash Attention 2 version of the model using two different sequence lengths. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_31.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cd3d43942d849b60ad18031570b35a2750f0659 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_31.txt @@ -0,0 +1 @@ +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the facebook/opt-350m checkpoint and the Flash Attention 2 version of the model using two different sequence lengths. 
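The chunks do not carry the speedup diagrams themselves. If you want a rough local comparison, a sketch along the following lines should work; it assumes a recent transformers release, a CUDA GPU, and flash-attn installed, and it is not the benchmark script used to produce the documentation figures.

```python
import time
import torch
from transformers import AutoTokenizer, OPTForCausalLM

tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
inputs = tokenizer(["A chat between a curious human and the Statue of Liberty."] * 8,
                   return_tensors="pt").to("cuda")

for impl in ("eager", "flash_attention_2"):
    model = OPTForCausalLM.from_pretrained(
        "facebook/opt-350m", torch_dtype=torch.float16, attn_implementation=impl
    ).to("cuda")
    torch.cuda.synchronize()
    start = time.perf_counter()
    model.generate(**inputs, max_new_tokens=128, do_sample=False)
    torch.cuda.synchronize()
    print(impl, f"{time.perf_counter() - start:.2f}s")
```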
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_32.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba05881892751f388890e738503fd6faa6d1e4d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_32.txt @@ -0,0 +1,29 @@ +OPTConfig +[[autodoc]] OPTConfig + +OPTModel +[[autodoc]] OPTModel + - forward +OPTForCausalLM +[[autodoc]] OPTForCausalLM + - forward +OPTForSequenceClassification +[[autodoc]] OPTForSequenceClassification + - forward +OPTForQuestionAnswering +[[autodoc]] OPTForQuestionAnswering + - forward + +TFOPTModel +[[autodoc]] TFOPTModel + - call +TFOPTForCausalLM +[[autodoc]] TFOPTForCausalLM + - call + +FlaxOPTModel +[[autodoc]] FlaxOPTModel + - call +FlaxOPTForCausalLM +[[autodoc]] FlaxOPTForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5008b7d4daedf1c29ae0a5521b74e7085c63625 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_7.txt @@ -0,0 +1 @@ +We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..75268bd99f63b6e45100416c815148e92c29f060 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by Arthur Zucker, Younes Belkada, and Patrick Von Platen. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_opt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_opt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_opt/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..deaa342e47131f32194affec9185dc5cc886163d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_10.txt @@ -0,0 +1 @@ +OWL-ST unlocks Web-scale training for open-world localization, similar to what has been seen for image classification and language modelling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..30b9c8f9fcb9b6280197ec44f771ae24a2844323 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_11.txt @@ -0,0 +1 @@ +OWLv2 high-level overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_12.txt @@ -0,0 +1 @@ +Taken from the original paper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..11ceccac55b22ecedc0c8de75dd3cfa48fe6277b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_15.txt @@ -0,0 +1,2 @@ +Usage example +OWLv2 is, just like its predecessor OWL-ViT, a zero-shot text-conditioned object detection model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b9db8557a8ec1c6fe5a009b5442ef3ce3834256 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_16.txt @@ -0,0 +1 @@ +OWL-ViT uses CLIP as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6847747c2a4be60e046885cde1cd80d3f91b14bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_17.txt @@ -0,0 +1 @@ +To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..2976b28cddd8391eac3d7620db2894199ee76cc0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_18.txt @@ -0,0 +1 @@ +Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa873e61ae5d0f1f550e8dfd292c4d6b6978a1e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_19.txt @@ -0,0 +1 @@ +The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9788713ae13e017528bebe04a3661646b09c8c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_20.txt @@ -0,0 +1 @@ +One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..7266954cbe9d7006a1eb73a549dd0f3b4e1577a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_21.txt @@ -0,0 +1 @@ +[Owlv2ImageProcessor] can be used to resize (or rescale) and normalize images for the model and [CLIPTokenizer] is used to encode the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fa73902839f3caa7017f5ce264de315ec4561a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_22.txt @@ -0,0 +1 @@ +[Owlv2Processor] wraps [Owlv2ImageProcessor] and [CLIPTokenizer] into a single instance to both encode the text and prepare the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..f92031d373d2c20bae8a31faf355ef06cd312184 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_23.txt @@ -0,0 +1 @@ +The following example shows how to perform object detection using [Owlv2Processor] and [Owlv2ForObjectDetection]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9de01d19b8f38fd363b739bbe309818e1844968 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_24.txt @@ -0,0 +1,29 @@ +thon + +import requests +from PIL import Image +import torch +from transformers import Owlv2Processor, Owlv2ForObjectDetection +processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble") +model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +texts = [["a photo of a cat", "a photo of a dog"]] +inputs = processor(text=texts, images=image, return_tensors="pt") +outputs = model(**inputs) +Target image sizes (height, width) to rescale box predictions [batch_size, 2] +target_sizes = torch.Tensor([image.size[::-1]]) +Convert outputs (bounding boxes and class logits) to Pascal VOC Format (xmin, ymin, xmax, ymax) +results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) +i = 0 # Retrieve predictions for the first image for the corresponding text queries +text = texts[i] +boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] +for box, score, label in zip(boxes, scores, labels): + box = [round(i, 2) for i in box.tolist()] + print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.32, 278.51] +Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85] + +Resources + +A demo notebook on using OWLv2 for zero- and one-shot (image-guided) object detection can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_25.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..08333b8629aba87bcc5c32d906bf76a757a3bcc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_25.txt @@ -0,0 +1,3 @@ +Zero-shot object detection task guide + +The architecture of OWLv2 is identical to OWL-ViT, however the object detection head now also includes an objectness classifier, which predicts the (query-agnostic) likelihood that a predicted box contains an object (as opposed to background). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_26.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0c761e44368b1241f82dcfd78718b22a49485d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_26.txt @@ -0,0 +1 @@ +The objectness score can be used to rank or filter predictions independently of text queries. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_27.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d99af31139816306714fec38fffb72ce5fa3080f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_27.txt @@ -0,0 +1 @@ +Usage of OWLv2 is identical to OWL-ViT with a new, updated image processor ([Owlv2ImageProcessor]). 
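Building on the detection example above, the query-agnostic objectness score mentioned in the preceding chunks can be read from the raw model output. The field name objectness_logits below matches recent transformers releases but should be treated as an assumption and verified against your installed version.

```python
# continuing from the OWLv2 snippet above (outputs = model(**inputs))
objectness = outputs.objectness_logits.squeeze(-1).sigmoid()   # (batch, num_patches), query-agnostic
top = objectness[0].topk(5)
print(top.values)                          # highest objectness scores
print(outputs.pred_boxes[0][top.indices])  # corresponding boxes (normalized coordinates)
```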
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_28.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cb1e0293cf7690b2dddc1de45611b3ca27b4398 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_28.txt @@ -0,0 +1,29 @@ +Owlv2Config +[[autodoc]] Owlv2Config + - from_text_vision_configs +Owlv2TextConfig +[[autodoc]] Owlv2TextConfig +Owlv2VisionConfig +[[autodoc]] Owlv2VisionConfig +Owlv2ImageProcessor +[[autodoc]] Owlv2ImageProcessor + - preprocess + - post_process_object_detection + - post_process_image_guided_detection +Owlv2Processor +[[autodoc]] Owlv2Processor +Owlv2Model +[[autodoc]] Owlv2Model + - forward + - get_text_features + - get_image_features +Owlv2TextModel +[[autodoc]] Owlv2TextModel + - forward +Owlv2VisionModel +[[autodoc]] Owlv2VisionModel + - forward +Owlv2ForObjectDetection +[[autodoc]] Owlv2ForObjectDetection + - forward + - image_guided_detection \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlv2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_owlv2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e237436846868a3a7e77255a1c423bd38182a5b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlv2/chunk_9.txt @@ -0,0 +1 @@ +However, with OWL-ST, we can scale to over 1B examples, yielding further large improvement: With an L/14 architecture, OWL-ST improves AP on LVIS rare classes, for which the model has seen no human box annotations, from 31.2% to 44.6% (43% relative improvement). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_10.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ed968ee76257c1893a97a59bb9dd5ee82547726 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_10.txt @@ -0,0 +1 @@ +OWL-ViT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_11.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_12.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..773cf31a6bf61e91bd6c3fb37484f259b8192082 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by adirik. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_13.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_14.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e128241729e5c8b2eeb9db09f69179274452733 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_14.txt @@ -0,0 +1,2 @@ +Usage tips +OWL-ViT is a zero-shot text-conditioned object detection model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b9db8557a8ec1c6fe5a009b5442ef3ce3834256 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_15.txt @@ -0,0 +1 @@ +OWL-ViT uses CLIP as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6847747c2a4be60e046885cde1cd80d3f91b14bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_16.txt @@ -0,0 +1 @@ +To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2976b28cddd8391eac3d7620db2894199ee76cc0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_17.txt @@ -0,0 +1 @@ +Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa873e61ae5d0f1f550e8dfd292c4d6b6978a1e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_18.txt @@ -0,0 +1 @@ +The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9788713ae13e017528bebe04a3661646b09c8c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_19.txt @@ -0,0 +1 @@ +One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc508488f5a4405edc40dfac1234b57f21101342 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_20.txt @@ -0,0 +1 @@ +[OwlViTImageProcessor] can be used to resize (or rescale) and normalize images for the model and [CLIPTokenizer] is used to encode the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c286b1f4d258e1b81ab729ee366825e9760f9b8b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_21.txt @@ -0,0 +1 @@ +[OwlViTProcessor] wraps [OwlViTImageProcessor] and [CLIPTokenizer] into a single instance to both encode the text and prepare the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..567daf6465789821c0b61cf0b90a5dc034607f98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_22.txt @@ -0,0 +1 @@ +The following example shows how to perform object detection using [OwlViTProcessor] and [OwlViTForObjectDetection]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..6790c6abd3099beefcf097b32162e697216ea9cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_23.txt @@ -0,0 +1,28 @@ +thon + +import requests +from PIL import Image +import torch +from transformers import OwlViTProcessor, OwlViTForObjectDetection +processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32") +model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +texts = [["a photo of a cat", "a photo of a dog"]] +inputs = processor(text=texts, images=image, return_tensors="pt") +outputs = model(**inputs) +Target image sizes (height, width) to rescale box predictions [batch_size, 2] +target_sizes = torch.Tensor([image.size[::-1]]) +Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) +results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1) +i = 0 # Retrieve predictions for the first image for the corresponding text queries +text = texts[i] +boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] +for box, score, label in zip(boxes, scores, labels): + box = [round(i, 2) for i in box.tolist()] + print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") +Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.58, 373.29] +Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17] + +Resources +A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object detection can be found here. 
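A minimal sketch of the one-shot, image-guided mode mentioned above: a query image replaces the text prompts, and image_guided_detection plus post_process_image_guided_detection (both listed in the API section below) perform the matching. The query image URL and the thresholds are illustrative assumptions.

```python
# Sketch: one-shot image-guided detection (URLs and thresholds are illustrative).
import requests
import torch
from PIL import Image
from transformers import OwlViTProcessor, OwlViTForObjectDetection

processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch16")
model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch16")

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
query_image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000001675.jpg", stream=True).raw)

inputs = processor(images=image, query_images=query_image, return_tensors="pt")
with torch.no_grad():
    outputs = model.image_guided_detection(**inputs)

target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_image_guided_detection(
    outputs=outputs, target_sizes=target_sizes, threshold=0.6, nms_threshold=0.3
)
for box, score in zip(results[0]["boxes"], results[0]["scores"]):
    print(f"score {score.item():.3f} at {[round(v, 1) for v in box.tolist()]}")
```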
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8df53a0e0ea25b6359f57e9097e68a915d1b0b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_24.txt @@ -0,0 +1,34 @@ +OwlViTConfig +[[autodoc]] OwlViTConfig + - from_text_vision_configs +OwlViTTextConfig +[[autodoc]] OwlViTTextConfig +OwlViTVisionConfig +[[autodoc]] OwlViTVisionConfig +OwlViTImageProcessor +[[autodoc]] OwlViTImageProcessor + - preprocess + - post_process_object_detection + - post_process_image_guided_detection +OwlViTFeatureExtractor +[[autodoc]] OwlViTFeatureExtractor + - call + - post_process + - post_process_image_guided_detection +OwlViTProcessor +[[autodoc]] OwlViTProcessor +OwlViTModel +[[autodoc]] OwlViTModel + - forward + - get_text_features + - get_image_features +OwlViTTextModel +[[autodoc]] OwlViTTextModel + - forward +OwlViTVisionModel +[[autodoc]] OwlViTVisionModel + - forward +OwlViTForObjectDetection +[[autodoc]] OwlViTForObjectDetection + - forward + - image_guided_detection \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5057b37a70282110039a50c9fd76448c103cf42 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_8.txt @@ -0,0 +1 @@ +We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_owlvit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_owlvit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a13c019d31781634730a990b1b3690c5822cb71b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_owlvit/chunk_9.txt @@ -0,0 +1 @@ +Code and models are available on GitHub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f6ce72186fa35124087a8a094b42e04b8dbfcb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_10.txt @@ -0,0 +1 @@ +Additionally, a simple gated attention mechanism is introduced in the backbone to prioritize important features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f72cdf4cfd6189bc409d17424df7ccc44fe15cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_11.txt @@ -0,0 +1 @@ +By incorporating these lightweight components, we significantly enhance the learning capability of simple MLP structures, outperforming complex Transformer models with minimal computing usage. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6e74b6ec54b273b6032de4d9d885d6bc0b4fd71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_12.txt @@ -0,0 +1 @@ +Moreover, TSMixer's modular design enables compatibility with both supervised and masked self-supervised learning methods, making it a promising building block for time-series Foundation Models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eac250bbebd6406aa0df0fc66f8272eff2363cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_13.txt @@ -0,0 +1 @@ +TSMixer outperforms state-of-the-art MLP and Transformer models in forecasting by a considerable margin of 8-60%. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3514f4e6fd79a907ab88fb5b786f001b67e07af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_14.txt @@ -0,0 +1 @@ +It also outperforms the latest strong benchmarks of Patch-Transformer models (by 1-2%) with a significant reduction in memory and runtime (2-3X). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3e0b9d1625f88a2c8eb98e4135e6f77d0a752f4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_15.txt @@ -0,0 +1,3 @@ +This model was contributed by ajati, vijaye12, +gsinthong, namctin, +wmgifford, kashif. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c070e46b123968d10e1e6b111b4e5e941dce0e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_16.txt @@ -0,0 +1,2 @@ +Usage example +The code snippet below shows how to randomly initialize a PatchTSMixer model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c5d673f69e66d231349f9ae02d4aec1ea0b00fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_17.txt @@ -0,0 +1 @@ +The model is compatible with the Trainer API. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd393f9234da9ca478b6f092f43cc1717705ecb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_18.txt @@ -0,0 +1,13 @@ +thon +from transformers import PatchTSMixerConfig, PatchTSMixerForPrediction +from transformers import Trainer, TrainingArguments, +config = PatchTSMixerConfig(context_length = 512, prediction_length = 96) +model = PatchTSMixerForPrediction(config) +trainer = Trainer(model=model, args=training_args, + train_dataset=train_dataset, + eval_dataset=valid_dataset) +trainer.train() +results = trainer.evaluate(test_dataset) + +Usage tips +The model can also be used for time series classification and time series regression. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..37895903293a0249612914c5df2fa4447999ed6b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_19.txt @@ -0,0 +1 @@ +See the respective [PatchTSMixerForTimeSeriesClassification] and [PatchTSMixerForRegression] classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d82f1a7eb2fcac07e5cbf2333815c5169f830dd5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_20.txt @@ -0,0 +1,3 @@ +Resources + +A blog post explaining PatchTSMixer in depth can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b90e12b30560c574d5205970ec25c8da8bb3543 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_21.txt @@ -0,0 +1 @@ +The blog can also be opened in Google Colab. 
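The snippet above leaves training_args and the datasets undefined; the sketch below fills in a plausible TrainingArguments and shows the forward signature on random data. The batch shape, config values, and the prediction_outputs attribute name are assumptions made for illustration.

```python
# Sketch: a runnable stand-in for the pieces the snippet above leaves undefined.
import torch
from transformers import PatchTSMixerConfig, PatchTSMixerForPrediction, TrainingArguments

# Illustrative hyperparameters only; real runs should tune these.
training_args = TrainingArguments(output_dir="patchtsmixer-out", per_device_train_batch_size=32, num_train_epochs=1)

config = PatchTSMixerConfig(context_length=512, prediction_length=96, num_input_channels=7)
model = PatchTSMixerForPrediction(config)

# The model consumes a (batch, context_length, num_input_channels) window of past values
# and forecasts prediction_length future steps for each channel.
past_values = torch.randn(4, config.context_length, config.num_input_channels)
with torch.no_grad():
    outputs = model(past_values=past_values)
print(outputs.prediction_outputs.shape)  # expected (4, 96, 7)
```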
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..443714c5757d5b1cbe4a04edb0ec120d6136d38c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtsmixer/chunk_22.txt @@ -0,0 +1,17 @@ +PatchTSMixerConfig +[[autodoc]] PatchTSMixerConfig +PatchTSMixerModel +[[autodoc]] PatchTSMixerModel + - forward +PatchTSMixerForPrediction +[[autodoc]] PatchTSMixerForPrediction + - forward +PatchTSMixerForTimeSeriesClassification +[[autodoc]] PatchTSMixerForTimeSeriesClassification + - forward +PatchTSMixerForPretraining +[[autodoc]] PatchTSMixerForPretraining + - forward +PatchTSMixerForRegression +[[autodoc]] PatchTSMixerForRegression + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_10.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..56a3816143560e4fba7543370831e3cea89a85b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_10.txt @@ -0,0 +1,2 @@ +Usage tips +The model can also be used for time series classification and time series regression. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_11.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3c632e33321a1eb424fec94b93532b9cdb7b477 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_11.txt @@ -0,0 +1 @@ +See the respective [PatchTSTForClassification] and [PatchTSTForRegression] classes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_12.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11607451a7ee347fcf3f5b6a6c8b14a4bd60225 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_12.txt @@ -0,0 +1,3 @@ +Resources + +A blog post explaining PatchTST in depth can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_13.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b90e12b30560c574d5205970ec25c8da8bb3543 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_13.txt @@ -0,0 +1 @@ +The blog can also be opened in Google Colab. 
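An analogous hedged sketch for PatchTST prediction; the config names and output attribute are assumed to mirror the PatchTSMixer example above, and all values are illustrative.

```python
# Sketch: point forecasting with PatchTST on random data (values are illustrative).
import torch
from transformers import PatchTSTConfig, PatchTSTForPrediction

config = PatchTSTConfig(context_length=512, prediction_length=96, num_input_channels=7)
model = PatchTSTForPrediction(config)

past_values = torch.randn(4, config.context_length, config.num_input_channels)
with torch.no_grad():
    outputs = model(past_values=past_values)
print(outputs.prediction_outputs.shape)  # expected (4, 96, 7)
```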
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_14.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..aadfafa7a5ee436b99fdd07652d16494d156f0b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_14.txt @@ -0,0 +1,17 @@ +PatchTSTConfig +[[autodoc]] PatchTSTConfig +PatchTSTModel +[[autodoc]] PatchTSTModel + - forward +PatchTSTForPrediction +[[autodoc]] PatchTSTForPrediction + - forward +PatchTSTForClassification +[[autodoc]] PatchTSTForClassification + - forward +PatchTSTForPretraining +[[autodoc]] PatchTSTForPretraining + - forward +PatchTSTForRegression +[[autodoc]] PatchTSTForRegression + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_7.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d5a9cf1faf14c318c4b4fbdb01ee9a25f2ab28c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_7.txt @@ -0,0 +1 @@ +Transferring of masked pre-trained representation on one dataset to others also produces SOTA forecasting accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_8.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b123aa1881cc06a622e95a1df2a56651e78e7140 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by namctin, gsinthong, diepi, vijaye12, wmgifford, and kashif. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_patchtst/chunk_9.txt b/chunked/content_aware_chunking/model_doc_patchtst/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_patchtst/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_10.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..91ce33cb6be86601ebb1057135c4d0c26733b77f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_10.txt @@ -0,0 +1,5 @@ +Checkpoints +All the checkpoints are fine-tuned for summarization, besides +pegasus-large, whence the other checkpoints are fine-tuned: + +Each checkpoint is 2.2 GB on disk and 568M parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_11.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d75602183d996bafd8bbc5f6343c576788b2f52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_11.txt @@ -0,0 +1 @@ +FP16 is not supported (help/ideas on this appreciated!). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_12.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7fbe0902e897e503e63fb2d44b67ca953c57419 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_12.txt @@ -0,0 +1 @@ +Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_13.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad388a8bfccff5c888b98ea278c16faf61269c88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_13.txt @@ -0,0 +1 @@ +Full replication results and correctly pre-processed data can be found in this Issue. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_14.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e27c670f1c030a7ae6f9604b21936977060da94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_14.txt @@ -0,0 +1 @@ +Distilled checkpoints are described in this paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_15.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9646b3f2e2d553968a047abf660d077d4bda9929 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_15.txt @@ -0,0 +1,3 @@ +Implementation Notes + +All models are transformer encoder-decoders with 16 layers in each component. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_16.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b8ef277f0c5308d91a9c27d80206998f83821a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_16.txt @@ -0,0 +1,4 @@ +The implementation is completely inherited from [BartForConditionalGeneration] +Some key configuration differences: +static, sinusoidal position embeddings +the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_17.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a90fb7967bbee40c3fe02121e866a62357e86e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_17.txt @@ -0,0 +1,3 @@ +more beams are used (num_beams=8) +All pretrained pegasus checkpoints are the same besides three attributes: tokenizer.model_max_length (maximum + input size), max_length (the maximum number of tokens to generate) and length_penalty. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_18.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4f859d52e1080f7ef95a7f72e3d740ed2bbaeb4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_18.txt @@ -0,0 +1,2 @@ +The code to convert checkpoints trained in the author's repo can be + found in convert_pegasus_tf_to_pytorch.py. 
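Since all pretrained pegasus checkpoints differ only in tokenizer.model_max_length, max_length, and length_penalty, those three attributes can be compared without downloading any weights. The checkpoint ids below are examples.

```python
# Sketch: compare the three attributes that distinguish pegasus checkpoints.
from transformers import AutoConfig, AutoTokenizer

for ckpt in ["google/pegasus-xsum", "google/pegasus-cnn_dailymail", "google/pegasus-large"]:
    config = AutoConfig.from_pretrained(ckpt)
    tokenizer = AutoTokenizer.from_pretrained(ckpt)
    print(
        f"{ckpt}: model_max_length={tokenizer.model_max_length}, "
        f"max_length={config.max_length}, length_penalty={config.length_penalty}"
    )
```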
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_19.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4747c365df135bade7f05ddd1f1b8e73b6585fa6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_19.txt @@ -0,0 +1,7 @@ +Usage Example +thon + +from transformers import PegasusForConditionalGeneration, PegasusTokenizer +import torch +src_text = [ + """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_20.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d1a7e76bf1ad6e54de61c163b4222161db70496 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_20.txt @@ -0,0 +1 @@ +The aim is to reduce the risk of wildfires. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_21.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..4943be48ce338e6bf71a90739732e7235c962ba7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_21.txt @@ -0,0 +1 @@ +Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow.""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_22.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..78b6897050bc54faee8848564e9a57606a5766cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_22.txt @@ -0,0 +1,12 @@ +] + + model_name = "google/pegasus-xsum" + device = "cuda" if torch.cuda.is_available() else "cpu" + tokenizer = PegasusTokenizer.from_pretrained(model_name) + model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device) + batch = tokenizer(src_text, truncation=True, padding="longest", return_tensors="pt").to(device) + translated = model.generate(**batch) + tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True) + assert ( + tgt_text[0] + == "California's largest electricity provider has turned off power to hundreds of thousands of customers." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_23.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..26443a4a7c15ad72b3f3c9f97340972451a47657 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_23.txt @@ -0,0 +1,6 @@ +) + +Resources + +Script to fine-tune pegasus + on the XSUM dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_24.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..80e98614813ef8e53c629364dd47eff4f4facf52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_24.txt @@ -0,0 +1 @@ +Data download instructions at examples/pytorch/summarization/. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_25.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..871cdc00e5d983e9ea91c920b018d5569dbf0d56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_25.txt @@ -0,0 +1,8 @@ +Causal language modeling task guide +Translation task guide +Summarization task guide + +PegasusConfig +[[autodoc]] PegasusConfig +PegasusTokenizer +warning: add_tokens does not work at the moment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_26.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..738393500d26b2134cfee08a15faefe3a485e6d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_26.txt @@ -0,0 +1,31 @@ +[[autodoc]] PegasusTokenizer +PegasusTokenizerFast +[[autodoc]] PegasusTokenizerFast + +PegasusModel +[[autodoc]] PegasusModel + - forward +PegasusForConditionalGeneration +[[autodoc]] PegasusForConditionalGeneration + - forward +PegasusForCausalLM +[[autodoc]] PegasusForCausalLM + - forward + +TFPegasusModel +[[autodoc]] TFPegasusModel + - call +TFPegasusForConditionalGeneration +[[autodoc]] TFPegasusForConditionalGeneration + - call + +FlaxPegasusModel +[[autodoc]] FlaxPegasusModel + - call + - encode + - decode +FlaxPegasusForConditionalGeneration +[[autodoc]] FlaxPegasusForConditionalGeneration + - call + - encode + - decode \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_7.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a05a47688f4201fa33898a1aef4c6702c9fafbb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_7.txt @@ -0,0 +1,3 @@ +MLM: encoder input tokens are randomly replaced by a mask tokens and have to be predicted by the encoder (like in BERT) + +GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a causal mask to hide the future words like a regular auto-regressive transformer decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_8.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d75602183d996bafd8bbc5f6343c576788b2f52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_8.txt @@ -0,0 +1 @@ +FP16 is not supported (help/ideas on this appreciated!). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus/chunk_9.txt b/chunked/content_aware_chunking/model_doc_pegasus/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2760114d4698d0e0e2d74827db55f8df2a264681 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus/chunk_9.txt @@ -0,0 +1 @@ +The adafactor optimizer is recommended for pegasus fine-tuning. 
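A minimal sketch of two common ways to enable Adafactor for pegasus fine-tuning, either through the Trainer or by constructing the optimizer directly; the hyperparameters are illustrative assumptions, not tuned values.

```python
# Sketch: Adafactor for pegasus fine-tuning (hyperparameters are illustrative).
from transformers import Adafactor, PegasusForConditionalGeneration, Seq2SeqTrainingArguments

model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

# Option 1: let the Trainer build the optimizer.
training_args = Seq2SeqTrainingArguments(output_dir="pegasus-finetuned", optim="adafactor")

# Option 2: build it manually with a fixed learning rate.
optimizer = Adafactor(model.parameters(), lr=1e-3, scale_parameter=False, relative_step=False)
```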
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_10.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec14451045cd0ffd1708539d8a993c5ae91d9ce8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_10.txt @@ -0,0 +1,6 @@ +Documentation resources + +Translation task guide +Summarization task guide + +PEGASUS-X uses the same tokenizer as PEGASUS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_11.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b4f009436f2b4cfc550186145deeab03da629c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_11.txt @@ -0,0 +1,8 @@ +PegasusXConfig +[[autodoc]] PegasusXConfig +PegasusXModel +[[autodoc]] PegasusXModel + - forward +PegasusXForConditionalGeneration +[[autodoc]] PegasusXForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_6.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ca60d6d2ab08d0a55975c838c4eb49d7a1ad403 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_6.txt @@ -0,0 +1 @@ +Based on our findings, we introduce PEGASUS-X, an extension of the PEGASUS model with additional long input pretraining to handle inputs of up to 16K tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_7.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6d7bdc78dfb43e20f4e72302806ec21c3fe418b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_7.txt @@ -0,0 +1 @@ +PEGASUS-X achieves strong performance on long input summarization tasks comparable with much larger models while adding few additional parameters and not requiring model parallelism to train. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_8.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..60d0848d7d1aacc49683d293e0f385731ed0b2e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by zphang. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_9.txt b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pegasus_x/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. 
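A minimal long-input summarization sketch with PEGASUS-X; the checkpoint id, input length, and generation settings are assumptions, and the tokenizer is the regular PEGASUS one as noted above.

```python
# Sketch: long-document summarization with PEGASUS-X (checkpoint id and settings are illustrative).
import torch
from transformers import AutoTokenizer, PegasusXForConditionalGeneration

model = PegasusXForConditionalGeneration.from_pretrained("google/pegasus-x-base")
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-x-base")

long_document = "..."  # replace with up to ~16K tokens of input text
inputs = tokenizer(long_document, truncation=True, max_length=16384, return_tensors="pt")
with torch.no_grad():
    summary_ids = model.generate(**inputs, max_new_tokens=256, num_beams=4)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])
```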
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_15.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc0d0a9e3f39df5c3eb1bdc90d844b436f6d28eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_15.txt @@ -0,0 +1,2 @@ +Perceiver aims to solve this issue by, instead of performing self-attention on the inputs, perform it on a set +of latent variables, and only use the inputs for cross-attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_16.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c77d12fb486b579557ee4edc375d323c03428196 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_16.txt @@ -0,0 +1,2 @@ +In this way, the time and memory requirements don't +depend on the length of the inputs anymore, as one uses a fixed amount of latent variables, like 256 or 512. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_17.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea294ca1babc45304130ea23693f33594dd57a7d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_17.txt @@ -0,0 +1,2 @@ +These are +randomly initialized, after which they are trained end-to-end using backpropagation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_18.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c44bbdf5e423bae64b4d83a25c80fae40bc1a70 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_18.txt @@ -0,0 +1,2 @@ +Internally, [PerceiverModel] will create the latents, which is a tensor of shape (batch_size, num_latents, +d_latents). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_19.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9899e710168d02d99d2a2187103828944327c60 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_19.txt @@ -0,0 +1 @@ +One must provide inputs (which could be text, images, audio, you name it!) \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_20.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1ddc874e3de10520ebe39d8c6dcb4d3ba218593 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_20.txt @@ -0,0 +1,2 @@ +to the model, which it will +use to perform cross-attention with the latents. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_21.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0760e755e81fd945cad82ecfb6dab9f0d34faf87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_21.txt @@ -0,0 +1 @@ +The output of the Perceiver encoder is a tensor of the same shape. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_22.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8f59fa1db923546c7b8b3730186a98ba4de8a1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_22.txt @@ -0,0 +1,3 @@ +One +can then, similar to BERT, convert the last hidden states of the latents to classification logits by averaging along +the sequence dimension, and placing a linear layer on top of that to project the d_latents to num_labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_23.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2d637aede0be8d3b70779490c116549e5b9583f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_23.txt @@ -0,0 +1 @@ +This was the idea of the original Perceiver paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_24.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c514758b7638e29e127a67d3cbd50d30ec46f3d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_24.txt @@ -0,0 +1 @@ +However, it could only output classification logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_25.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..198017a89fdc1d323968824539ccb332063cb9f2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_25.txt @@ -0,0 +1,2 @@ +In a follow-up +work, PerceiverIO, they generalized it to let the model also produce outputs of arbitrary size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_26.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ced0a491eb5a3259944802f00786ab522c663eae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_26.txt @@ -0,0 +1 @@ +How, you might ask? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_27.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..b380b22e5927f64909af63d9bcf51b5e1f6eb37d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_27.txt @@ -0,0 +1,3 @@ +The +idea is actually relatively simple: one defines outputs of an arbitrary size, and then applies cross-attention with the +last hidden states of the latents, using the outputs as queries, and the latents as keys and values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_28.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d20879b0fb88ffd4683d0713cc225d7af0acfe1d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_28.txt @@ -0,0 +1 @@ +So let's say one wants to perform masked language modeling (BERT-style) with the Perceiver. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_29.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbacd9cdce92645658302a461eb270aee8de45f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_29.txt @@ -0,0 +1,3 @@ +As the Perceiver's input +length will not have an impact on the computation time of the self-attention layers, one can provide raw bytes, +providing inputs of length 2048 to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_30.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..11629f8e03cbb2568acd38302c1977d8ee35f47c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_30.txt @@ -0,0 +1,2 @@ +If one now masks out certain of these 2048 tokens, one can define the +outputs as being of shape: (batch_size, 2048, 768). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_31.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..299ac6b03b9f8c2451e89da50a3594d9f0b1d1a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_31.txt @@ -0,0 +1,2 @@ +Next, one performs cross-attention with the final hidden states +of the latents to update the outputs tensor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_32.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..712bd34c87603f2e8ed15a1e08269fb84a13e713 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_32.txt @@ -0,0 +1,2 @@ +After cross-attention, one still has a tensor of shape (batch_size, +2048, 768). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_33.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b6ee09344685fb5c00d52759d58b25d9bd12375 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_33.txt @@ -0,0 +1,2 @@ +One can then place a regular language modeling head on top, to project the last dimension to the +vocabulary size of the model, i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_34.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d2ecb3cae153c123dac2a4f2fd35505ec2cbdb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_34.txt @@ -0,0 +1,2 @@ +creating logits of shape (batch_size, 2048, 262) (as Perceiver uses a vocabulary +size of 262 byte IDs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_35.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..10c7e5e188c706b980f45cd95b8b41f83faf3207 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_35.txt @@ -0,0 +1 @@ +Perceiver IO architecture. 
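A minimal sketch of the byte-level masked language modeling setup described above, using the released deepmind/language-perceiver checkpoint; the masked byte positions are chosen purely for illustration.

```python
# Sketch: BERT-style masked language modeling over raw bytes with Perceiver IO.
import torch
from transformers import PerceiverTokenizer, PerceiverForMaskedLM

tokenizer = PerceiverTokenizer.from_pretrained("deepmind/language-perceiver")
model = PerceiverForMaskedLM.from_pretrained("deepmind/language-perceiver")

text = "This is an incomplete sentence where some words are missing."
inputs = tokenizer(text, padding="max_length", return_tensors="pt")  # pads to 2048 bytes

# Mask a span of byte positions (indices are illustrative).
inputs["input_ids"][0, 52:61] = tokenizer.mask_token_id

with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (batch_size, 2048, 262): one distribution over byte IDs per position

predicted_bytes = logits[0, 52:61].argmax(dim=-1)
print(tokenizer.decode(predicted_bytes))
```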
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_36.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..b36ba0aaa042af8815009fab38de1aa294181ec1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_36.txt @@ -0,0 +1,2 @@ +Taken from the original paper +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_37.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_37.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_38.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd74144f6b698127504f45b3385cabb23ec2d3bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_38.txt @@ -0,0 +1,6 @@ +Perceiver does not work with torch.nn.DataParallel due to a bug in PyTorch, see issue #36035 + +Resources + +The quickest way to get started with the Perceiver is by checking the tutorial + notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_39.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce567f915971e7783f142ae219b06c2348d0ae71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_39.txt @@ -0,0 +1,2 @@ +Refer to the blog post if you want to fully understand how the model works and +is implemented in the library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_40.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f80074762456bbc9d28447075a5f94300f0d9af7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_40.txt @@ -0,0 +1,2 @@ +Note that the models available in the library only showcase some examples of what you can do +with the Perceiver. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_41.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..38ead45be701c582d90a6823ba39b86a839b2dbf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_41.txt @@ -0,0 +1,2 @@ +There are many more use cases, including question answering, named-entity recognition, object detection, +audio classification, video classification, etc. 
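As one concrete use case, a minimal image classification sketch with the learned-position-embedding variant and the released deepmind/vision-perceiver-learned checkpoint; the image URL is illustrative.

```python
# Sketch: ImageNet classification with PerceiverForImageClassificationLearned.
import requests
import torch
from PIL import Image
from transformers import PerceiverImageProcessor, PerceiverForImageClassificationLearned

processor = PerceiverImageProcessor.from_pretrained("deepmind/vision-perceiver-learned")
model = PerceiverForImageClassificationLearned.from_pretrained("deepmind/vision-perceiver-learned")

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
pixel_values = processor(images=image, return_tensors="pt").pixel_values

with torch.no_grad():
    logits = model(inputs=pixel_values).logits  # preprocessed pixels are passed as `inputs`
print(model.config.id2label[logits.argmax(-1).item()])
```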
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_perceiver/chunk_42.txt b/chunked/content_aware_chunking/model_doc_perceiver/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9c2bda2bcfb84e3b01f4b85df72f22ace7e0845 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_perceiver/chunk_42.txt @@ -0,0 +1,74 @@ +Text classification task guide +Masked language modeling task guide +Image classification task guide + +Perceiver specific outputs +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverModelOutput +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverDecoderOutput +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMaskedLMOutput +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassifierOutput +PerceiverConfig +[[autodoc]] PerceiverConfig +PerceiverTokenizer +[[autodoc]] PerceiverTokenizer + - call +PerceiverFeatureExtractor +[[autodoc]] PerceiverFeatureExtractor + - call +PerceiverImageProcessor +[[autodoc]] PerceiverImageProcessor + - preprocess +PerceiverTextPreprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverTextPreprocessor +PerceiverImagePreprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverImagePreprocessor +PerceiverOneHotPreprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOneHotPreprocessor +PerceiverAudioPreprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPreprocessor +PerceiverMultimodalPreprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPreprocessor +PerceiverProjectionDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionDecoder +PerceiverBasicDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicDecoder +PerceiverClassificationDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationDecoder +PerceiverOpticalFlowDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverOpticalFlowDecoder +PerceiverBasicVideoAutoencodingDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverBasicVideoAutoencodingDecoder +PerceiverMultimodalDecoder +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalDecoder +PerceiverProjectionPostprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverProjectionPostprocessor +PerceiverAudioPostprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverAudioPostprocessor +PerceiverClassificationPostprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverClassificationPostprocessor +PerceiverMultimodalPostprocessor +[[autodoc]] models.perceiver.modeling_perceiver.PerceiverMultimodalPostprocessor +PerceiverModel +[[autodoc]] PerceiverModel + - forward +PerceiverForMaskedLM +[[autodoc]] PerceiverForMaskedLM + - forward +PerceiverForSequenceClassification +[[autodoc]] PerceiverForSequenceClassification + - forward +PerceiverForImageClassificationLearned +[[autodoc]] PerceiverForImageClassificationLearned + - forward +PerceiverForImageClassificationFourier +[[autodoc]] PerceiverForImageClassificationFourier + - forward +PerceiverForImageClassificationConvProcessing +[[autodoc]] PerceiverForImageClassificationConvProcessing + - forward +PerceiverForOpticalFlow +[[autodoc]] PerceiverForOpticalFlow + - forward +PerceiverForMultimodalAutoencoding +[[autodoc]] PerceiverForMultimodalAutoencoding + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_10.txt 
b/chunked/content_aware_chunking/model_doc_persimmon/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e15f2d3109d5e8db96d735494bdeeaad97e68f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by ArthurZ. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_11.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_12.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b873ac797598d1eb197ae8464cd656998193601 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +The Persimmon models were trained using bfloat16, but the original inference uses float16 The checkpoints uploaded on the hub use torch_dtype = 'float16' which will be +used by the AutoModel API to cast the checkpoints from torch.float32 to torch.float16. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_13.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..e447e491e18ef028d5d033e89bb3a101c13846ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_13.txt @@ -0,0 +1 @@ +The dtype of the online weights is mostly irrelevant, unless you are using torch_dtype="auto" when initializing a model using model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto"). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_14.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..006237698db1f67b611969f034511e46323e11c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_14.txt @@ -0,0 +1 @@ +The reason is that the model will first be downloaded ( using the dtype of the checkpoints online) then it will be cast to the default dtype of torch (becomes torch.float32). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_15.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2eeef4585cfffde0c5ccc67ebd2e03e9671e723f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_15.txt @@ -0,0 +1 @@ +Users should specify the torch_dtype they want, and if they don't it will be torch.float32. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_16.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..28665b877cc7abd6c33c04bed114fb2261876ed9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_16.txt @@ -0,0 +1 @@ +Finetuning the model in float16 is not recommended and known to produce nan, as such the model should be fine-tuned in bfloat16. 
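A minimal sketch of the dtype behaviour described above; the hub id adept/persimmon-8b-base is an assumption based on the released checkpoints.

```python
# Sketch: controlling the load dtype for Persimmon (hub id is an assumption).
import torch
from transformers import AutoModelForCausalLM

# torch_dtype="auto" keeps the dtype stored in the checkpoint (float16 for the released weights),
model_fp16 = AutoModelForCausalLM.from_pretrained("adept/persimmon-8b-base", torch_dtype="auto")

# while an explicit bfloat16 cast matches the recommended fine-tuning dtype.
model_bf16 = AutoModelForCausalLM.from_pretrained("adept/persimmon-8b-base", torch_dtype=torch.bfloat16)
print(model_fp16.dtype, model_bf16.dtype)
```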
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_17.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ed7d1be3a60fa9fb978ec31ddd7fde70109911b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_17.txt @@ -0,0 +1,21 @@ +Tips: + +To convert the model, you need to clone the original repository using git clone https://github.com/persimmon-ai-labs/adept-inference, then get the checkpoints: + +git clone https://github.com/persimmon-ai-labs/adept-inference +wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar +tar -xvf 8b_base_model_release.tar +python src/transformers/models/persimmon/convert_persimmon_weights_to_hf.py --input_dir /path/to/downloaded/persimmon/weights/ --output_dir /output/path \ + --pt_model_path /path/to/8b_chat_model_release/iter_0001251/mp_rank_00/model_optim_rng.pt + --ada_lib_path /path/to/adept-inference +For the chat model: + +wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar +tar -xvf 8b_chat_model_release.tar +Thereafter, models can be loaded via: + +from transformers import PersimmonForCausalLM, PersimmonTokenizer +model = PersimmonForCausalLM.from_pretrained("/output/path") +tokenizer = PersimmonTokenizer.from_pretrained("/output/path") + +Persimmon uses a sentencepiece-based tokenizer, with a Unigram model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_18.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c343f5896df65be06c7f1380084581ddfa9244c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_18.txt @@ -0,0 +1 @@ +It supports bytefallback, which is only available in tokenizers==0.14.0 for the fast tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_19.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f136186a061869af864a7682b8586eb013b1455 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_19.txt @@ -0,0 +1 @@ +The LlamaTokenizer is used as it is a standard wrapper around sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_20.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..32b14a96342575a184bfb50c6e6aaf1d6b7a2113 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_20.txt @@ -0,0 +1 @@ +The chat template will be updated with the templating functions in a follow-up PR!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_21.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..bec866d22ec1624aec5b7b060e924fd0f9c04c6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_21.txt @@ -0,0 +1,13 @@ +The authors suggest to use the following prompt format for the chat mode: f"human: {prompt}\n\nadept:" + +PersimmonConfig +[[autodoc]] PersimmonConfig +PersimmonModel +[[autodoc]] PersimmonModel + - forward +PersimmonForCausalLM +[[autodoc]] PersimmonForCausalLM + - forward +PersimmonForSequenceClassification +[[autodoc]] PersimmonForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_8.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..029ab37b68cf7c62a45da593a717286b0c273c00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_8.txt @@ -0,0 +1 @@ +The authors present a fast inference code that outperforms traditional implementations through operator fusion and CUDA graph utilization while maintaining code coherence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_persimmon/chunk_9.txt b/chunked/content_aware_chunking/model_doc_persimmon/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..da4a0edf2277fcb183ce05e613400c40c081d474 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_persimmon/chunk_9.txt @@ -0,0 +1 @@ +They express their anticipation of how the community will leverage this contribution to drive innovation, hinting at further upcoming releases as part of an ongoing series of developments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_14.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bf8af3a8918d7277ba99cfb39dad0f295f608f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_14.txt @@ -0,0 +1,4 @@ +More generally, phi-1.5 exhibits many of the traits of much larger LLMs, both good –such +as the ability to “think step by step†or perform some rudimentary in-context learning– and bad, +including hallucinations and the potential for toxic and biased generations –encouragingly though, we +are seeing improvement on that front thanks to the absence of web data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_15.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a71f8923bf1e7cc402c70da1732ffebf1ec12650 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_15.txt @@ -0,0 +1,2 @@ +We open-source phi-1.5 to +promote further research on these urgent topics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_16.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..da25758841c5a2df6576b576e5390e7ad79a3e28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_16.txt @@ -0,0 +1 @@ +This model was contributed by Susnato Dhar. 
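A minimal sketch of chat-mode generation with the Persimmon prompt format suggested above; the hub id and generation settings are assumptions.

```python
# Sketch: chat-mode generation with the suggested "human: ...\n\nadept:" prompt format.
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained("adept/persimmon-8b-chat", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained("adept/persimmon-8b-chat")

prompt = "What is the capital of France?"
inputs = tokenizer(f"human: {prompt}\n\nadept:", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```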
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_17.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc65ff9afacc96919ce4c360523e08e6f0daf48f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_17.txt @@ -0,0 +1 @@ +The original code for Phi-1, Phi-1.5 and Phi-2 can be found here, here and here, respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_18.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..95e79a41e2356c5ffa4b72177aa7b5dd3c32f92d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_18.txt @@ -0,0 +1,3 @@ +Usage tips + +This model is quite similar to Llama with the main difference in [PhiDecoderLayer], where they used [PhiAttention] and [PhiMLP] layers in parallel configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_19.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..eab9489e4f87a5e5581d6b26d5477d43ac273e38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_19.txt @@ -0,0 +1 @@ +The tokenizer used for this model is identical to the [CodeGenTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_20.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..62bd0f77ddae5e4e6cef99b47b26483d4fdb95ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_20.txt @@ -0,0 +1,3 @@ +How to use Phi-2 + +Phi-2 has been integrated in the development version (4.37.0.dev) of transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_21.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..76a9be789af6da52b67aab92f71a242bc0401ee7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_21.txt @@ -0,0 +1,3 @@ +Until the official version is released through pip, ensure that you are doing one of the following: + +When loading the model, ensure that trust_remote_code=True is passed as an argument of the from_pretrained() function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_22.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b1c1edcb4a687ba151609fd3eaca57b3812f3b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_22.txt @@ -0,0 +1 @@ +Update your local transformers to the development version: pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_23.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8980a7964e9c8d674d01ffed7a7436b6b62b183e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_23.txt @@ -0,0 +1 @@ +The previous command is an alternative to cloning and installing from the source. 
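A minimal sketch of the first option above (the microsoft/phi-2 checkpoint name comes from this section; trust_remote_code=True simply lets from_pretrained use the modeling code shipped with the Hub repository while Phi-2 support is not yet in an official pip release):

from transformers import AutoModelForCausalLM, AutoTokenizer

# only needed until Phi-2 support ships in an official transformers release, as described above
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)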
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_24.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..5be0507c4dd694098690a4a639e9900602641a80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_24.txt @@ -0,0 +1,6 @@ +thon + +from transformers import AutoModelForCausalLM, AutoTokenizer +model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2") +tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2") +inputs = tokenizer('Can you help me write a formal email to a potential business partner proposing a joint venture? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_25.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..818986910f3655849c4d2d7b8cfa0769923905da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_25.txt @@ -0,0 +1,11 @@ +', return_tensors="pt", return_attention_mask=False) +outputs = model.generate(**inputs, max_length=30) +text = tokenizer.batch_decode(outputs)[0] +print(text) +'Can you help me write a formal email to a potential business partner proposing a joint venture?\nInput: Company A: ABC Inc.\nCompany B: XYZ Ltd.\nJoint Venture: A new online platform for e-commerce' + +Example : +thon + +from transformers import PhiForCausalLM, AutoTokenizer +define the model and tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_26.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0dc1f9bb93952bb124c810570790c6f86be7e2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_26.txt @@ -0,0 +1,3 @@ +model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5") +tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5") +feel free to change the prompt to your liking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_27.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..42eae5d0564fff32018d65ee3386b7a045b2e4e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_27.txt @@ -0,0 +1,2 @@ +prompt = "If I were an AI that had just achieved" +apply the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_28.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..3511efd3bdb48c92bd8d2c5d3eeec7fbde5aa5c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_28.txt @@ -0,0 +1,2 @@ +tokens = tokenizer(prompt, return_tensors="pt") +use the model to generate new tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_29.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..c489c2e66d4640fecaf506a092b4ecb0d6f1539d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_29.txt @@ -0,0 +1,6 @@ +generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) +tokenizer.batch_decode(generated_output)[0] +'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled' + +Combining Phi and Flash Attention 2 +First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_30.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c8befe42aecbcd19b04130e166207c73a46264 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_30.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Make also sure that you have a hardware that is compatible with Flash-Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_31.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f54478ededdb4c998599671bfd287599f84cc76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_31.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of flash-attn repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_32.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef85380a7fd818d955330819ccf432ff686d273 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_32.txt @@ -0,0 +1 @@ +Make also sure to load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_33.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..a50ccf7d56eb4da7e8484a5ae8c30af44da14d88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_33.txt @@ -0,0 +1,7 @@ +`torch.float16``) +To load and run a model using Flash Attention 2, refer to the snippet below: +thon + +import torch +from transformers import PhiForCausalLM, AutoTokenizer +define the model and tokenizer and push the model and tokens to the GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_34.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cc45ae83c50f86402cf45756c8fe3a014e7cdd3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_34.txt @@ -0,0 +1,3 @@ +model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda") +tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5") +feel free to change the prompt to your liking. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_35.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..42eae5d0564fff32018d65ee3386b7a045b2e4e9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_35.txt @@ -0,0 +1,2 @@ +prompt = "If I were an AI that had just achieved" +apply the tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_36.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..851bf9406ccf1b395893c8d2a0b62d51400e2fbb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_36.txt @@ -0,0 +1,2 @@ +tokens = tokenizer(prompt, return_tensors="pt").to("cuda") +use the model to generate new tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_37.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..da1cfce3d1d6cbd490ef1bee2402abd9660a4dd6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_37.txt @@ -0,0 +1,6 @@ +generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) +tokenizer.batch_decode(generated_output)[0] +'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled' + +Expected speedups +Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using microsoft/phi-1 checkpoint and the Flash Attention 2 version of the model using a sequence length of 2048. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phi/chunk_38.txt b/chunked/content_aware_chunking/model_doc_phi/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..118194a0b2ec17727b2a5d9de5c4add940f7b66f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phi/chunk_38.txt @@ -0,0 +1,16 @@ +PhiConfig +[[autodoc]] PhiConfig + +PhiModel +[[autodoc]] PhiModel + - forward +PhiForCausalLM +[[autodoc]] PhiForCausalLM + - forward + - generate +PhiForSequenceClassification +[[autodoc]] PhiForSequenceClassification + - forward +PhiForTokenClassification +[[autodoc]] PhiForTokenClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..941ca30196fd9e6f93bfd0827590a8d68a65710e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_10.txt @@ -0,0 +1,2 @@ +PhobertTokenizer +[[autodoc]] PhobertTokenizer \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_4.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_4.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..193fa6c888bfe6e7f806cea8298690e6376c46b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_5.txt @@ -0,0 +1,8 @@ +Usage example +thon + +import torch +from transformers import AutoModel, AutoTokenizer +phobert = AutoModel.from_pretrained("vinai/phobert-base") +tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base") +INPUT TEXT MUST BE ALREADY WORD-SEGMENTED! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..63cf2581d7c8675419743083fdbb04eef2105c74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_6.txt @@ -0,0 +1 @@ +line = "Tôi là sinh_viên trường đại_học Công_nghệ ." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bd95963c844dac5b3830c117e34624596db649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_7.txt @@ -0,0 +1,9 @@ +input_ids = torch.tensor([tokenizer.encode(line)]) +with torch.no_grad(): + features = phobert(input_ids) # Model outputs are now tuples +With TensorFlow 2.0+: +from transformers import TFAutoModel +phobert = TFAutoModel.from_pretrained("vinai/phobert-base") + + +PhoBERT implementation is the same as BERT, except for tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..85c161e84396eced100fd8faddff330a1938cb11 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_8.txt @@ -0,0 +1,2 @@ +Refer to BERT documentation for information on +configuration classes and their parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_phobert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_phobert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e1f1ed5e1c81b03a8bd3744979aea758c711001 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_phobert/chunk_9.txt @@ -0,0 +1 @@ +The PhoBERT-specific tokenizer is documented below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_10.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..373f93b75445c54a111b5c5465218dd61b35e637 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_10.txt @@ -0,0 +1 @@ +The full list can be found in Table 1 of the paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_11.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f7f82f8c6f022c9019716cbcca566437b8aa953 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_11.txt @@ -0,0 +1 @@ +We therefore advise you to use these models for the tasks they have been fine-tuned on.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_12.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..542deecd4fd238489c5d9610b5b51a18b761eb2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_12.txt @@ -0,0 +1 @@ +For instance, if you want to use Pix2Struct for UI captioning, you should use the model fine tuned on the UI dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_13.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ca49747597bb12986439dee1114202c0071bcd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_13.txt @@ -0,0 +1 @@ +If you want to use Pix2Struct for image captioning, you should use the model fine tuned on the natural images captioning dataset and so on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_14.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3985949786d70ddb10b25426b0744012cffa277 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_14.txt @@ -0,0 +1 @@ +If you want to use the model to perform conditional text captioning, make sure to use the processor with add_special_tokens=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_15.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b88f6b94f0e7b9f14885823b777caa6d5a0d5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by ybelkada. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_16.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. 
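A rough sketch of the conditional captioning tip above (the google/pix2struct-textcaps-base checkpoint name and the example image URL are assumptions chosen for illustration, not taken from this document):

import requests
from PIL import Image
from transformers import Pix2StructForConditionalGeneration, Pix2StructProcessor

image = Image.open(requests.get("https://www.ilankelman.org/stopsigns/australia.jpg", stream=True).raw)
processor = Pix2StructProcessor.from_pretrained("google/pix2struct-textcaps-base")
model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

# add_special_tokens=False, as advised above for conditional text captioning
inputs = processor(images=image, text="A picture of", return_tensors="pt", add_special_tokens=False)
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])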
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_17.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3f4b4997779368d2cc41364088c90e57f25c383 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_17.txt @@ -0,0 +1,26 @@ +Resources + +Fine-tuning Notebook +All models + +Pix2StructConfig +[[autodoc]] Pix2StructConfig + - from_text_vision_configs +Pix2StructTextConfig +[[autodoc]] Pix2StructTextConfig +Pix2StructVisionConfig +[[autodoc]] Pix2StructVisionConfig +Pix2StructProcessor +[[autodoc]] Pix2StructProcessor +Pix2StructImageProcessor +[[autodoc]] Pix2StructImageProcessor + - preprocess +Pix2StructTextModel +[[autodoc]] Pix2StructTextModel + - forward +Pix2StructVisionModel +[[autodoc]] Pix2StructVisionModel + - forward +Pix2StructForConditionalGeneration +[[autodoc]] Pix2StructForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_7.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..db07ecbc0d3c0bcd81daeb11986136ee7575853b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_7.txt @@ -0,0 +1 @@ +In addition to the novel pretraining strategy, we introduce a variable-resolution input representation and a more flexible integration of language and vision inputs, where language prompts such as questions are rendered directly on top of the input image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_8.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..44fca5ae240f31fd2cc86c0d8e7e168491d22d15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_8.txt @@ -0,0 +1 @@ +For the first time, we show that a single pretrained model can achieve state-of-the-art results in six out of nine tasks across four domains: documents, illustrations, user interfaces, and natural images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pix2struct/chunk_9.txt b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..96b90c9190241c0e812f96feaef91a01015f9e15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pix2struct/chunk_9.txt @@ -0,0 +1,2 @@ +Tips: +Pix2Struct has been fine tuned on a variety of tasks and datasets, ranging from image captioning, visual question answering (VQA) over different inputs (books, charts, science diagrams), captioning UI components etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_10.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c3d0926c3557f620bec98b6a4bd85cc9cc7733a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_10.txt @@ -0,0 +1 @@ +The Authors' code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_11.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f339a81fa305af810c892758f131c360de9628ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_11.txt @@ -0,0 +1,2 @@ +Usage examples +PLBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for code-to-text, text-to-code, code-to-code tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_12.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..338e95b2868d8eaaa202455c1943ba01677ab515 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_12.txt @@ -0,0 +1,2 @@ +As the +model is multilingual it expects the sequences in a different format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_13.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..db9245ca2989ec0fb759b1492b3ec29641170bf4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_13.txt @@ -0,0 +1,2 @@ +A special language id token is added in both the +source and target text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_14.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..87c29dadf7318363938336c0a9028421c5cc3f18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_14.txt @@ -0,0 +1 @@ +The source text format is X [eos, src_lang_code] where X is the source text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_15.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..38691b06124e2c101e8ea332a0c4fd927d188a2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_15.txt @@ -0,0 +1,2 @@ +The +target text format is [tgt_lang_code] X [eos]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_16.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a35ed805ce3cd7e5960410873ecb9896dd4eb06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_16.txt @@ -0,0 +1 @@ +bos is never used. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_17.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..9eea6f10355a153f10b446943c290c10f96c5bf9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_17.txt @@ -0,0 +1 @@ +However, for fine-tuning, in some cases no language token is provided in cases where a single language is used. 
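A small sketch to make the format above concrete, inspecting where the source language code lands in the encoded sequence (it assumes the uclanlp/plbart-base checkpoint used elsewhere in this section):

from transformers import PLBartTokenizer

tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="python", tgt_lang="en_XX")
ids = tokenizer("def add(a, b): return a + b").input_ids
# per the source format X [eos, src_lang_code], the last two tokens should be eos followed by the source language code
print(tokenizer.convert_ids_to_tokens(ids)[-2:])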
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_18.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..34ac65053e28ee7aebb9d5a29416587fa7f5ef6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_18.txt @@ -0,0 +1 @@ +Please refer to the paper to learn more about this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_19.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..0950e5766c4ecf5b5b795d0ee651760f679521cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_19.txt @@ -0,0 +1,3 @@ +In cases where the language code is needed, the regular [~PLBartTokenizer.__call__] will encode source text format +when you pass texts as the first argument or with the keyword argument text, and will encode target text format if +it's passed with the text_target keyword argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_20.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a6821fbbc27bfe3c92af6fe54fbb983eaec6649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_20.txt @@ -0,0 +1,12 @@ +Supervised training +thon + +from transformers import PLBartForConditionalGeneration, PLBartTokenizer +tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-base", src_lang="en_XX", tgt_lang="python") +example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" +expected_translation_english = "Returns the maximum value of a b c." +inputs = tokenizer(example_python_phrase, text_target=expected_translation_english, return_tensors="pt") +model(**inputs) + +Generation +While generating the target text set the decoder_start_token_id to the target language id. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_21.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..97d968dfd665817ab4a5f90b2a56ac6c404af577 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_21.txt @@ -0,0 +1,2 @@ +The following + example shows how to translate Python to English using the uclanlp/plbart-python-en_XX model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_22.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..4adaca6bc1d3a66f549f5f832081be89df8c477b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_22.txt @@ -0,0 +1,35 @@ +thon + +from transformers import PLBartForConditionalGeneration, PLBartTokenizer +tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="en_XX") +example_python_phrase = "def maximum(a,b,c):NEW_LINE_INDENTreturn max([a,b,c])" +inputs = tokenizer(example_python_phrase, return_tensors="pt") +model = PLBartForConditionalGeneration.from_pretrained("uclanlp/plbart-python-en_XX") +translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["en_XX"]) +tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0] +"Returns the maximum value of a b c." 
+ +Resources + +Text classification task guide +Causal language modeling task guide +Translation task guide +Summarization task guide + +PLBartConfig +[[autodoc]] PLBartConfig +PLBartTokenizer +[[autodoc]] PLBartTokenizer + - build_inputs_with_special_tokens +PLBartModel +[[autodoc]] PLBartModel + - forward +PLBartForConditionalGeneration +[[autodoc]] PLBartForConditionalGeneration + - forward +PLBartForSequenceClassification +[[autodoc]] PLBartForSequenceClassification + - forward +PLBartForCausalLM +[[autodoc]] PLBartForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_8.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e721670fb9c35241f03a071dc3a58790f468e9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_8.txt @@ -0,0 +1,3 @@ +Furthermore, analysis reveals that PLBART learns program syntax, style (e.g., identifier naming convention), logical flow +(e.g., if block inside an else block is equivalent to else if block) that are crucial to program semantics and thus excels +even with limited annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_plbart/chunk_9.txt b/chunked/content_aware_chunking/model_doc_plbart/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4437f404c70b3ed51882313dfac05f543c00533f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_plbart/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by gchhablani. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d74b66fffefe568a5b86675c597d03b477c003c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_10.txt @@ -0,0 +1 @@ +Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a2db83982632c64f3b13d0541c4398fcacaf9ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_11.txt @@ -0,0 +1 @@ +This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1499e51811202ebf12e6d7473e7ad023c69d458c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_12.txt @@ -0,0 +1 @@ +Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff6b05071d964eef2524d694c5f4ef7da1a11d1d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_13.txt @@ -0,0 +1 @@ +The figure below illustrates the architecture of PoolFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_14.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f490a08e250ef78a943124865d05d7c09716f896 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by heytanay. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..69eaec4cb59e16f700c2a43219563f6b95ec1374 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_17.txt @@ -0,0 +1,3 @@ +Usage tips + +PoolFormer has a hierarchical architecture, where instead of Attention, a simple Average Pooling layer is present. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6035e61573f071a0210d58b09baf737351f8527a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_18.txt @@ -0,0 +1 @@ +All checkpoints of the model can be found on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ac9b6c8ca12e7dcc5639457a531f125d475f048 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_19.txt @@ -0,0 +1 @@ +One can use [PoolFormerImageProcessor] to prepare images for the model. 
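A minimal sketch of that flow for image classification (the sail/poolformer_s12 checkpoint name and the COCO image URL are assumptions for illustration):

import requests
from PIL import Image
from transformers import PoolFormerImageProcessor, PoolFormerForImageClassification

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
image_processor = PoolFormerImageProcessor.from_pretrained("sail/poolformer_s12")
model = PoolFormerForImageClassification.from_pretrained("sail/poolformer_s12")

inputs = image_processor(images=image, return_tensors="pt")
logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])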
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f131c97911495ebe341cf532166e5b855617eac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_20.txt @@ -0,0 +1 @@ +As most models, PoolFormer comes in different sizes, the details of which can be found in the table below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..849f7e115756a4a5d468edf42bf7d93b2dbda34c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_21.txt @@ -0,0 +1,9 @@ +| Model variant | Depths | Hidden sizes | Params (M) | ImageNet-1k Top 1 | +| :---------------: | ------------- | ------------------- | :------------: | :-------------------: | +| s12 | [2, 2, 6, 2] | [64, 128, 320, 512] | 12 | 77.2 | +| s24 | [4, 4, 12, 4] | [64, 128, 320, 512] | 21 | 80.3 | +| s36 | [6, 6, 18, 6] | [64, 128, 320, 512] | 31 | 81.4 | +| m36 | [6, 6, 18, 6] | [96, 192, 384, 768] | 56 | 82.1 | +| m48 | [8, 8, 24, 8] | [96, 192, 384, 768] | 73 | 82.5 | +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with PoolFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..10db2fb5aadd031d231617b19d480b4be8b54fb1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_22.txt @@ -0,0 +1 @@ +[PoolFormerForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_23.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..db720afa1d9e5644742a90ea68b1fedf5775c91f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_25.txt @@ -0,0 +1,14 @@ +PoolFormerConfig +[[autodoc]] PoolFormerConfig +PoolFormerFeatureExtractor +[[autodoc]] PoolFormerFeatureExtractor + - call +PoolFormerImageProcessor +[[autodoc]] PoolFormerImageProcessor + - preprocess +PoolFormerModel +[[autodoc]] PoolFormerModel + - forward +PoolFormerForImageClassification +[[autodoc]] PoolFormerForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5dfac5c8bc7d2f3c139d7162041d63df77f9c2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_7.txt @@ -0,0 +1 @@ +Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7033f05d95a13a20214325b7d2c256a4aa31a164 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_8.txt @@ -0,0 +1 @@ +For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_poolformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_poolformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b97c3bb3530b06f4f3a7672c3270cb3a8558103 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_poolformer/chunk_9.txt @@ -0,0 +1 @@ +The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_10.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..59a73dc7cee105cf69ebbfac842d1bd5ac2af18b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_10.txt @@ -0,0 +1,2 @@ +The abstract from the paper is the following: +Piano covers of pop music are enjoyed by many people. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_11.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..427a869ff73e7ebe638271b29e4545d8e2a7ed1d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_11.txt @@ -0,0 +1,3 @@ +However, the +task of automatically generating piano covers of pop music is still +understudied. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_12.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..5930b013235481c77f02f68a4262d29929b1b9b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_12.txt @@ -0,0 +1,3 @@ +This is partly due to the lack of synchronized +{Pop, Piano Cover} data pairs, which made it challenging to apply +the latest data-intensive deep learning-based methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_13.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ee0c4dd092776d7fb6b80759a9090230ea8188 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_13.txt @@ -0,0 +1,4 @@ +To leverage +the power of the data-driven approach, we make a large amount of +paired and synchronized {Pop, Piano Cover} data using an automated +pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_14.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc938a59b58b6cb176f54f4d03cf03e9dfe6a9a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_14.txt @@ -0,0 +1,2 @@ +In this paper, we present Pop2Piano, a Transformer network +that generates piano covers given waveforms of pop music. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_15.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..740e6147b469127b8dc8e2d9b532eac1760b7a21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_15.txt @@ -0,0 +1,4 @@ +To the best +of our knowledge, this is the first model to generate a piano cover +directly from pop audio without using melody and chord extraction +modules. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_16.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1041915832a961124771fb70cd37d3a58ff5db23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_16.txt @@ -0,0 +1,2 @@ +We show that Pop2Piano, trained with our dataset, is capable +of producing plausible piano covers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_17.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..da25758841c5a2df6576b576e5390e7ad79a3e28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_17.txt @@ -0,0 +1 @@ +This model was contributed by Susnato Dhar. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_18.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_18.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_19.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3133f3cb9254af34636c31c632ffa6c6f6a803ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_19.txt @@ -0,0 +1,6 @@ +Usage tips + +To use Pop2Piano, you will need to install the 🤗 Transformers library, as well as the following third party modules: + +pip install pretty-midi==0.2.9 essentia==2.1b6.dev1034 librosa scipy +Please note that you may need to restart your runtime after installation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_20.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9e5fb516744f5ba827e802fac2cc8015ac461f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_20.txt @@ -0,0 +1 @@ +Pop2Piano is an Encoder-Decoder based model like T5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_21.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b25b2a0b09059bbdc650a244fb19ce820da7fb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_21.txt @@ -0,0 +1 @@ +Pop2Piano can be used to generate midi-audio files for a given audio sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_22.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9ab61d9b89a051fe61f3b7a895b72adf5cc8fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_22.txt @@ -0,0 +1 @@ +Choosing different composers in Pop2PianoForConditionalGeneration.generate() can lead to variety of different results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_23.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c740c3ebb1d6e8c53a41f3c9bb991ba322c5bfd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_23.txt @@ -0,0 +1 @@ +Setting the sampling rate to 44.1 kHz when loading the audio file can give good performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_24.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ebb391c17a52f63be1fbfec2604c59c1225677a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_24.txt @@ -0,0 +1 @@ +Though Pop2Piano was mainly trained on Korean Pop music, it also does pretty well on other Western Pop or Hip Hop songs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_25.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbbf6b9f7025442447751068c158ec77bf4c041f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_25.txt @@ -0,0 +1,27 @@ +Examples + +Example using HuggingFace Dataset: + +thon + +from datasets import load_dataset +from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor +model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +ds = load_dataset("sweetcocoa/pop2piano_ci", split="test") +inputs = processor( + audio=ds["audio"][0]["array"], sampling_rate=ds["audio"][0]["sampling_rate"], return_tensors="pt" + ) +model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +tokenizer_output = processor.batch_decode( + token_ids=model_output, feature_extractor_output=inputs + )["pretty_midi_objects"][0] +tokenizer_output.write("./Outputs/midi_output.mid") + +Example using your own audio file: + +thon + +import librosa +from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor +audio, sr = librosa.load("", sr=44100) # feel free to change the sr to a suitable value. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_26.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..28f5eeba54dd363c05d4be7144f7e21f21da3d34 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_26.txt @@ -0,0 +1,16 @@ +model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +inputs = processor(audio=audio, sampling_rate=sr, return_tensors="pt") +model_output = model.generate(input_features=inputs["input_features"], composer="composer1") +tokenizer_output = processor.batch_decode( + token_ids=model_output, feature_extractor_output=inputs + )["pretty_midi_objects"][0] +tokenizer_output.write("./Outputs/midi_output.mid") + +Example of processing multiple audio files in batch: + +thon + +import librosa +from transformers import Pop2PianoForConditionalGeneration, Pop2PianoProcessor +feel free to change the sr to a suitable value. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_27.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4bd0744516a8da7755edfb037b6058991f83a3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_27.txt @@ -0,0 +1,25 @@ +audio1, sr1 = librosa.load("", sr=44100) +audio2, sr2 = librosa.load("", sr=44100) +model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +processor = Pop2PianoProcessor.from_pretrained("sweetcocoa/pop2piano") +inputs = processor(audio=[audio1, audio2], sampling_rate=[sr1, sr2], return_attention_mask=True, return_tensors="pt") +Since we now generating in batch(2 audios) we must pass the attention_mask +model_output = model.generate( + input_features=inputs["input_features"], + attention_mask=inputs["attention_mask"], + composer="composer1", + ) +tokenizer_output = processor.batch_decode( + token_ids=model_output, feature_extractor_output=inputs + )["pretty_midi_objects"] +Since we now have 2 generated MIDI files +tokenizer_output[0].write("./Outputs/midi_output1.mid") +tokenizer_output[1].write("./Outputs/midi_output2.mid") + +Example of processing multiple audio files in batch (Using Pop2PianoFeatureExtractor and Pop2PianoTokenizer): + +thon + +import librosa +from transformers import Pop2PianoForConditionalGeneration, Pop2PianoFeatureExtractor, Pop2PianoTokenizer +feel free to change the sr to a suitable value. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_28.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..855ba803c19f6cbb66b577ce20aa3bec7c45eedd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_28.txt @@ -0,0 +1,39 @@ +audio1, sr1 = librosa.load("", sr=44100) +audio2, sr2 = librosa.load("", sr=44100) +model = Pop2PianoForConditionalGeneration.from_pretrained("sweetcocoa/pop2piano") +feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano") +tokenizer = Pop2PianoTokenizer.from_pretrained("sweetcocoa/pop2piano") +inputs = feature_extractor( + audio=[audio1, audio2], + sampling_rate=[sr1, sr2], + return_attention_mask=True, + return_tensors="pt", + ) +Since we now generating in batch(2 audios) we must pass the attention_mask +model_output = model.generate( + input_features=inputs["input_features"], + attention_mask=inputs["attention_mask"], + composer="composer1", + ) +tokenizer_output = tokenizer.batch_decode( + token_ids=model_output, feature_extractor_output=inputs + )["pretty_midi_objects"] +Since we now have 2 generated MIDI files +tokenizer_output[0].write("./Outputs/midi_output1.mid") +tokenizer_output[1].write("./Outputs/midi_output2.mid") + +Pop2PianoConfig +[[autodoc]] Pop2PianoConfig +Pop2PianoFeatureExtractor +[[autodoc]] Pop2PianoFeatureExtractor + - call +Pop2PianoForConditionalGeneration +[[autodoc]] Pop2PianoForConditionalGeneration + - forward + - generate +Pop2PianoTokenizer +[[autodoc]] Pop2PianoTokenizer + - call +Pop2PianoProcessor +[[autodoc]] Pop2PianoProcessor + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_5.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..10bd01b8b325b94e6d30f9628b35a9ab59bdc859 --- /dev/null +++ 
b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_5.txt @@ -0,0 +1 @@ +Pop2Piano is an encoder-decoder Transformer model based on T5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_6.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..e129077bbae108a8837baa7c0ce334ac2a3d88bd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_6.txt @@ -0,0 +1,2 @@ +The input audio +is transformed to its waveform and passed to the encoder, which transforms it to a latent representation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_7.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..07bb30f464e05c32a18ce2ea8dad14bc382f2169 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_7.txt @@ -0,0 +1,2 @@ +The decoder +uses these latent representations to generate token ids in an autoregressive way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_8.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..869f2f7a9a695f22a07ab4a9d1d434395951f30c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_8.txt @@ -0,0 +1,2 @@ +Each token id corresponds to one of four +different token types: time, velocity, note and 'special'. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pop2piano/chunk_9.txt b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9254aec1143c28267595f6fc6d135ebcd35d1a9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pop2piano/chunk_9.txt @@ -0,0 +1 @@ +The token ids are then decoded to their equivalent MIDI file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..20f1548e9235bdcbe78540a39d9ccc6b3166fac1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_10.txt @@ -0,0 +1 @@ +The model architecture is based on the original Transformer, but replaces the “standard†self-attention mechanism in the decoder by a a main self-attention mechanism and a self and n-stream (predict) self-attention mechanism. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..631cb1defefa87a134c771702b1aa99031a7a7ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_11.txt @@ -0,0 +1,30 @@ +Resources + +Causal language modeling task guide +Translation task guide +Summarization task guide + +ProphetNetConfig +[[autodoc]] ProphetNetConfig +ProphetNetTokenizer +[[autodoc]] ProphetNetTokenizer +ProphetNet specific outputs +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput +[[autodoc]] models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput +ProphetNetModel +[[autodoc]] ProphetNetModel + - forward +ProphetNetEncoder +[[autodoc]] ProphetNetEncoder + - forward +ProphetNetDecoder +[[autodoc]] ProphetNetDecoder + - forward +ProphetNetForConditionalGeneration +[[autodoc]] ProphetNetForConditionalGeneration + - forward +ProphetNetForCausalLM +[[autodoc]] ProphetNetForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_5.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11a4d8646b4a7b01bf314143f65a2c898c9fedb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_5.txt @@ -0,0 +1,2 @@ +We pre-train ProphetNet using a base scale dataset (16GB) and a large scale +dataset (160GB) respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb8994110ca39fb5e8fea2bf4efaeffaf4242deb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_6.txt @@ -0,0 +1,2 @@ +Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for +abstractive summarization and question generation tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0be51aae1b11f030107017059fa099ff9850b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_7.txt @@ -0,0 +1,2 @@ +Experimental results show that ProphetNet achieves new +state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c3d0926c3557f620bec98b6a4bd85cc9cc7733a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_8.txt @@ -0,0 +1 @@ +The Authors' code can be found here. 
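As a hedged sketch of how the classes listed above are typically used for generation (the microsoft/prophetnet-large-uncased checkpoint name and the input text are assumptions; right padding is the tokenizer default, which matches the padding tip that follows):

from transformers import ProphetNetForConditionalGeneration, ProphetNetTokenizer

tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

inputs = tokenizer("ProphetNet is pre-trained to predict the next n tokens simultaneously.", return_tensors="pt")
summary_ids = model.generate(**inputs, max_length=30)
print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True)[0])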
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_prophetnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..311ced30f63f8b86d820b774922f9cfa223a62f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_prophetnet/chunk_9.txt @@ -0,0 +1,4 @@ +Usage tips

+ProphetNet is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than + the left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf03c4a69bd13a6024b6a0c57f8d8f5b3e8fd62d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_10.txt @@ -0,0 +1,2 @@ +For example, with a comparable number of parameters, PVT+RetinaNet +achieves 40.4 AP on the COCO dataset, surpassing ResNet50+RetinaNet (36.3 AP) by 4.1 absolute AP (see Figure 2). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..10bfa0c7525b653af663d182a6787f457721f324 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_11.txt @@ -0,0 +1,2 @@ +We hope +that PVT could serve as an alternative and useful backbone for pixel-level predictions and facilitate future research. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f11dfb673c651c2011c8f74ea1e2e3e44f9a5e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by Xrenya. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here.
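Because ProphetNet uses absolute position embeddings, the usage tip above recommends padding on the right. A minimal hedged sketch of what that looks like in practice follows; the checkpoint name is an assumption and the explicit padding_side assignment is only there to make the advice visible.
python
# Hedged sketch of right padding for ProphetNet, as advised above.
from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

tokenizer = ProphetNetTokenizer.from_pretrained("microsoft/prophetnet-large-uncased")  # assumed checkpoint
tokenizer.padding_side = "right"  # pad on the right rather than the left
model = ProphetNetForConditionalGeneration.from_pretrained("microsoft/prophetnet-large-uncased")

batch = tokenizer(
    ["a short source sentence", "a noticeably longer source sentence that forces padding"],
    padding=True,
    return_tensors="pt",
)
summaries = model.generate(
    input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], max_length=20
)
print(tokenizer.batch_decode(summaries, skip_special_tokens=True))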
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa2ffdc45978f311ae3172390658c46c706de9a9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_14.txt @@ -0,0 +1,19 @@ +PVTv1 on ImageNet-1K + +| Model variant |Size |Acc@1|Params (M)| +|--------------------|:-------:|:-------:|:------------:| +| PVT-Tiny | 224 | 75.1 | 13.2 | +| PVT-Small | 224 | 79.8 | 24.5 | +| PVT-Medium | 224 | 81.2 | 44.2 | +| PVT-Large | 224 | 81.7 | 61.4 | +PvtConfig +[[autodoc]] PvtConfig +PvtImageProcessor +[[autodoc]] PvtImageProcessor + - preprocess +PvtForImageClassification +[[autodoc]] PvtForImageClassification + - forward +PvtModel +[[autodoc]] PvtModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..27496bb0c1f8fa69dbb2d5aba29af52817446d5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_8.txt @@ -0,0 +1,2 @@ +PVT inherits the advantages of both CNN and Transformer, making it a unified +backbone for various vision tasks without convolutions, where it can be used as a direct replacement for CNN backbones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_pvt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_pvt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..102dde6eb95fb67b8816e8ce9b3dc23cf1de5567 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_pvt/chunk_9.txt @@ -0,0 +1,2 @@ +We validate PVT through extensive experiments, showing that it boosts the performance of many downstream tasks, including +object detection, instance and semantic segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a95c69af20530e84de42405a5602daef5219e72f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_10.txt @@ -0,0 +1,3 @@ +Set default quantizers +QDQBERT model adds fake quantization operations (pair of QuantizeLinear/DequantizeLinear ops) to BERT by +TensorQuantizer in Pytorch Quantization Toolkit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6223069921c94fa23e6648638df680651db2255 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_11.txt @@ -0,0 +1,2 @@ +TensorQuantizer is the module +for quantizing tensors, with QuantDescriptor defining how the tensor should be quantized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1eb16479d1a73def03b84f23fac04c9ccdf58f79 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_12.txt @@ -0,0 +1,2 @@ +Refer to Pytorch +Quantization Toolkit userguide for more details. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..295cabbbcd8f61ae2ab253f42a1878637f54af46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_13.txt @@ -0,0 +1 @@ +Before creating QDQBERT model, one has to set the default QuantDescriptor defining default tensor quantizers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7505e6bdfc63ec91a2a3b645bbd3d436c2c60304 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_14.txt @@ -0,0 +1,15 @@ +Example: +thon + +import pytorch_quantization.nn as quant_nn +from pytorch_quantization.tensor_quant import QuantDescriptor +The default tensor quantizer is set to use Max calibration method +input_desc = QuantDescriptor(num_bits=8, calib_method="max") +The default tensor quantizer is set to be per-channel quantization for weights +weight_desc = QuantDescriptor(num_bits=8, axis=((0,))) +quant_nn.QuantLinear.set_default_quant_desc_input(input_desc) +quant_nn.QuantLinear.set_default_quant_desc_weight(weight_desc) + +Calibration +Calibration is the terminology of passing data samples to the quantizer and deciding the best scaling factors for +tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ce38d1b33ea2469eb9671a40cf033055600137d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_15.txt @@ -0,0 +1,22 @@ +After setting up the tensor quantizers, one can use the following example to calibrate the model: +thon + +Find the TensorQuantizer and enable calibration +for name, module in model.named_modules(): + if name.endswith("_input_quantizer"): + module.enable_calib() + module.disable_quant() # Use full precision data to calibrate +Feeding data samples +model(x) + +Finalize calibration +for name, module in model.named_modules(): + if name.endswith("_input_quantizer"): + module.load_calib_amax() + module.enable_quant() +If running on GPU, it needs to call .cuda() again because new tensors will be created by calibration process +model.cuda() +Keep running the quantized model + +Export to ONNX +The goal of exporting to ONNX is to deploy inference by TensorRT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..bed88d35b54c6ff7593b3afaa7fa4afda38cf15f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_16.txt @@ -0,0 +1,2 @@ +Fake +quantization will be broken into a pair of QuantizeLinear/DequantizeLinear ONNX ops. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c145cf3bdb819f596178e072bc080a6a58a62f95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_17.txt @@ -0,0 +1,3 @@ +After setting the static member of +TensorQuantizer to use Pytorch's own fake quantization functions, the fake quantized model can be exported to ONNX by following +the instructions in torch.onnx. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..009848e1b5678c4ff01a7560965c808965111e34 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_18.txt @@ -0,0 +1,45 @@ +Example: +python + +from pytorch_quantization.nn import TensorQuantizer +TensorQuantizer.use_fb_fake_quant = True +# Load the calibrated model + +# ONNX export +torch.onnx.export() + +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +QDQBertConfig +[[autodoc]] QDQBertConfig +QDQBertModel +[[autodoc]] QDQBertModel + - forward +QDQBertLMHeadModel +[[autodoc]] QDQBertLMHeadModel + - forward +QDQBertForMaskedLM +[[autodoc]] QDQBertForMaskedLM + - forward +QDQBertForSequenceClassification +[[autodoc]] QDQBertForSequenceClassification + - forward +QDQBertForNextSentencePrediction +[[autodoc]] QDQBertForNextSentencePrediction + - forward +QDQBertForMultipleChoice +[[autodoc]] QDQBertForMultipleChoice + - forward +QDQBertForTokenClassification +[[autodoc]] QDQBertForTokenClassification + - forward +QDQBertForQuestionAnswering +[[autodoc]] QDQBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qdqbert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c9a7a4ef60f88c8a851cdbda21ff9a815e0649 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qdqbert/chunk_9.txt @@ -0,0 +1,2 @@ +A complete example of using the QDQBERT model to perform Quantization Aware Training and Post Training Quantization for + the SQuAD task can be found at transformers/examples/research_projects/quantization-qdqbert/. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_3.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..0089494730a4ceb4a76cde78bf548124e48e0045 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_3.txt @@ -0,0 +1 @@ +For each size, we release the base language model and the aligned chat model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_4.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..53c96cbba0f789001dd9c310a71df68bbcc6db58 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_4.txt @@ -0,0 +1 @@ +It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, mixture of sliding window attention and full attention, etc.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_5.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..700be05fbb2774dc9697c1fc58b0877bc5c95416 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_5.txt @@ -0,0 +1 @@ +Additionally, we have an improved tokenizer adaptive to multiple natural languages and codes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbc880a4027fab0d6cf48d3081c973ec2a1edc5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_6.txt @@ -0,0 +1,3 @@ +Usage tips +Qwen2-7B-beta and Qwen2-7B-Chat-beta can be found on the Huggingface Hub +In the following, we demonstrate how to use Qwen2-7B-Chat-beta for the inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d1efba24e5dadf2180553b103888ad00dfe9099 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_7.txt @@ -0,0 +1 @@ +Note that we have used the ChatML format for dialog, in this demo we show how to leverage apply_chat_template for this purpose. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0212463567a4c60a5683f68aac3846c131aa0776 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_8.txt @@ -0,0 +1,7 @@ +thon + +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +model = AutoModelForCausalLM.from_pretrained("Qwen2/Qwen2-7B-Chat-beta", device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("Qwen2/Qwen2-7B-Chat-beta") +prompt = "Give me a short introduction to large language model." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_qwen2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_qwen2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..359417929c3d727dd8f16b0a43618768d20af5d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_qwen2/chunk_9.txt @@ -0,0 +1,23 @@ +messages = [{"role": "user", "content": prompt}] +text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +model_inputs = tokenizer([text], return_tensors="pt").to(device) +generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True) +generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)] +response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] + +Qwen2Config +[[autodoc]] Qwen2Config +Qwen2Tokenizer +[[autodoc]] Qwen2Tokenizer + - save_vocabulary +Qwen2TokenizerFast +[[autodoc]] Qwen2TokenizerFast +Qwen2Model +[[autodoc]] Qwen2Model + - forward +Qwen2ForCausalLM +[[autodoc]] Qwen2ForCausalLM + - forward +Qwen2ForSequenceClassification +[[autodoc]] Qwen2ForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_10.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..339bbfe636b341879ac5baf00e37e4f8cf1d9a75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_10.txt @@ -0,0 +1,2 @@ +We compare two RAG formulations, one which conditions on the same retrieved passages +across the whole generated sequence, the other can use different passages per token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_11.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e7e8650c77d79eca6deb731b49a59285d16595c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_11.txt @@ -0,0 +1,3 @@ +We fine-tune and evaluate our +models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks, +outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_12.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc5ea528ec670dd977495d7e3a4441a7edb5c10d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_12.txt @@ -0,0 +1,3 @@ +For language generation +tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art +parametric-only seq2seq baseline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_13.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..598388d74a6b117e6d0ae676bf0dcad0b02333a0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_13.txt @@ -0,0 +1 @@ +This model was contributed by ola13. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_14.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..515993ac3e6a864cfe53bf90cb2fec7a07cf80de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_14.txt @@ -0,0 +1,2 @@ +Usage tips +Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_15.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b697efe6a64a79574c074e6a10b7294338c01c23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_15.txt @@ -0,0 +1 @@ +RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_16.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d39687fb2193e75e4538b13ef543b4ca4f35dc4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_16.txt @@ -0,0 +1,3 @@ +The retriever and seq2seq +modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt +to downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_17.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..878820394a8095f775715f52a83a608d215e4197 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_17.txt @@ -0,0 +1,33 @@ +RagConfig +[[autodoc]] RagConfig +RagTokenizer +[[autodoc]] RagTokenizer +Rag specific outputs +[[autodoc]] models.rag.modeling_rag.RetrievAugLMMarginOutput +[[autodoc]] models.rag.modeling_rag.RetrievAugLMOutput +RagRetriever +[[autodoc]] RagRetriever + +RagModel +[[autodoc]] RagModel + - forward +RagSequenceForGeneration +[[autodoc]] RagSequenceForGeneration + - forward + - generate +RagTokenForGeneration +[[autodoc]] RagTokenForGeneration + - forward + - generate + +TFRagModel +[[autodoc]] TFRagModel + - call +TFRagSequenceForGeneration +[[autodoc]] TFRagSequenceForGeneration + - call + - generate +TFRagTokenForGeneration +[[autodoc]] TFRagTokenForGeneration + - call + - generate \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_7.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..8205d44baf62f9b45cc8e488246fc56b9743c215 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_7.txt @@ -0,0 +1,2 @@ +Pre-trained models with a differentiable access mechanism to explicit nonparametric +memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. 
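The RAG flow summarized above (retrieve documents, hand them to a seq2seq generator, marginalize over them) maps onto the RagRetriever and RagSequenceForGeneration classes. The sketch below is hedged: the facebook/rag-sequence-nq checkpoint and the dummy-index settings are assumptions chosen only to keep the example small.
python
# Hedged sketch of retrieve -> seq2seq -> marginalize with RAG.
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration

tokenizer = RagTokenizer.from_pretrained("facebook/rag-sequence-nq")
retriever = RagRetriever.from_pretrained(
    "facebook/rag-sequence-nq", index_name="exact", use_dummy_dataset=True  # small dummy index for demo purposes
)
model = RagSequenceForGeneration.from_pretrained("facebook/rag-sequence-nq", retriever=retriever)

inputs = tokenizer("who holds the record in 100m freestyle", return_tensors="pt")
generated = model.generate(input_ids=inputs["input_ids"])  # retrieval happens inside the forward pass
print(tokenizer.batch_decode(generated, skip_special_tokens=True)[0])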
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_8.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..af96ef021612517f53d1632872b7ead812fbdc73 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_8.txt @@ -0,0 +1,3 @@ +We explore a +general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained +parametric and non-parametric memory for language generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rag/chunk_9.txt b/chunked/content_aware_chunking/model_doc_rag/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d740fb4dd6b77425edc03e2a0d05a961a3d2a7c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rag/chunk_9.txt @@ -0,0 +1,3 @@ +We introduce RAG models where the parametric memory is a +pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a +pre-trained neural retriever. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_realm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_realm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6e168c61b9c92a89d32255643b255eccaceefc6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_realm/chunk_10.txt @@ -0,0 +1,30 @@ +RealmConfig +[[autodoc]] RealmConfig +RealmTokenizer +[[autodoc]] RealmTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_encode_candidates +RealmTokenizerFast +[[autodoc]] RealmTokenizerFast + - batch_encode_candidates +RealmRetriever +[[autodoc]] RealmRetriever +RealmEmbedder +[[autodoc]] RealmEmbedder + - forward +RealmScorer +[[autodoc]] RealmScorer + - forward +RealmKnowledgeAugEncoder +[[autodoc]] RealmKnowledgeAugEncoder + - forward +RealmReader +[[autodoc]] RealmReader + - forward +RealmForOpenQA +[[autodoc]] RealmForOpenQA + - block_embedding_to + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_realm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_realm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7488df54afe06381494d4032c8336a2b8ede572 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_realm/chunk_6.txt @@ -0,0 +1,3 @@ +We +demonstrate the effectiveness of Retrieval-Augmented Language Model pre-training (REALM) by fine-tuning on the +challenging task of Open-domain Question Answering (Open-QA). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_realm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_realm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae652df44d2cd3311a3c177d03cf642055a6fd94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_realm/chunk_7.txt @@ -0,0 +1,4 @@ +We compare against state-of-the-art models for both +explicit and implicit knowledge storage on three popular Open-QA benchmarks, and find that we outperform all previous +methods by a significant margin (4-16% absolute accuracy), while also providing qualitative benefits such as +interpretability and modularity. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_realm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_realm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..41f9b4bfc46e5b1ed876e67b9249d4901d184910 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_realm/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by qqaatw. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_realm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_realm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_realm/chunk_9.txt @@ -0,0 +1,2 @@ +The original code can be found +here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d89a687465af403480f8d4ce1b0f1759fbd7fd3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_16.txt @@ -0,0 +1,3 @@ +In models that process very long input sequences, the +conventional position id encodings store an embedding vector of size \(d\) being the config.hidden_size for +every position \(i, \ldots, n_s\), with \(n_s\) being config.max_embedding_size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..105dc218f8992862ced44208173bd439fc02f7bb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_17.txt @@ -0,0 +1,5 @@ +This means that having +a sequence length of \(n_s = 2^{19} \approx 0.5M\) and a config.hidden_size of \(d = 2^{10} \approx 1000\) +would result in a position encoding matrix: +$$X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right]$$ +which alone has over 500M parameters to store. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ff4309645f8b9b7ab1f3293b7d270bb6fa92dd2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_18.txt @@ -0,0 +1,14 @@ +Axial positional encodings factorize \(X_{i,j}\) into two matrices: +$$X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right]$$ +and +$$X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right]$$ +with: +$$d = d^1 + d^2 \text{ and } n_s = n_s^1 \times n_s^2 .$$ +Therefore the following holds: +$$X_{i,j} = \begin{cases} +X^{1}_{i, k}, & \text{if }\ i < d^1 \text{ with } k = j \mod n_s^1 \\ +X^{2}_{i - d^1, l}, & \text{if } i \ge d^1 \text{ with } l = \lfloor\frac{j}{n_s^1}\rfloor +\end{cases}$$ +Intuitively, this means that a position embedding vector \(x_j \in \mathbb{R}^{d}\) is now the composition of two +factorized embedding vectors: \(x^1_{k, l} + x^2_{l, k}\), whereas the config.max_embedding_size dimension +\(j\) is factorized into \(k \text{ and } l\).
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..21ef6a279b49a79ce82ab0a64fb0f9ef9423a58b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_19.txt @@ -0,0 +1,2 @@ +This design ensures that each position embedding vector +\(x_j\) is unique. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c825d216687822df76d1f77d819b3ed7b0de7926 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_20.txt @@ -0,0 +1,2 @@ +Using the above example again, axial position encoding with \(d^1 = 2^9, d^2 = 2^9, n_s^1 = 2^9, n_s^2 = 2^{10}\) +can drastically reduce the number of parameters from 500 000 000 to \(2^{18} + 2^{19} \approx 780 000\) parameters, which means 85% less memory usage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ee5be44cf30d529e26c3a4bd14128a2334e0689 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_21.txt @@ -0,0 +1,4 @@ +In practice, the parameter config.axial_pos_embds_dim is set to a tuple \((d^1, d^2)\) whose sum has to be +equal to config.hidden_size and config.axial_pos_shape is set to a tuple \((n_s^1, n_s^2)\) whose +product has to be equal to config.max_embedding_size, which during training has to be equal to the sequence +length of the input_ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..63c2850409b047a1134b1643d5538327014c5852 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_22.txt @@ -0,0 +1,2 @@ +LSH Self Attention +In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..166ef04d521e91aa328cfe00831e9a7f3210e906 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_23.txt @@ -0,0 +1,2 @@ +Therefore, the key +query embedding vectors are also tied. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..60e228594d6bcca8b9df2b241a64a0be8680a743 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_24.txt @@ -0,0 +1,3 @@ +LSH self attention uses the locality sensitive hashing mechanism proposed in +Practical and Optimal LSH for Angular Distance to assign each of the tied key +query embedding vectors to one of config.num_buckets possible buckets.
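The parameter counts quoted above for axial positional encodings can be verified with a few lines of arithmetic; the ReformerConfig construction at the end is an assumed illustration of how \((d^1, d^2)\) and \((n_s^1, n_s^2)\) map onto config.axial_pos_embds_dim and config.axial_pos_shape.
python
# Quick check of the axial-position-embedding savings discussed above.
d, n_s = 2**10, 2**19
full = d * n_s                      # 536_870_912 parameters (~0.5B) for the naive encoding

d1, d2 = 2**9, 2**9                 # d1 + d2 equals d
n1, n2 = 2**9, 2**10                # n1 * n2 equals n_s
axial = d1 * n1 + d2 * n2           # 786_432 parameters (~0.78M)
print(full, axial)

# Assumed illustration of the corresponding config fields:
from transformers import ReformerConfig
config = ReformerConfig(
    hidden_size=d,
    axial_pos_embds_dim=(d1, d2),   # sum has to equal config.hidden_size
    axial_pos_shape=(n1, n2),       # product has to equal the training sequence length
)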
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..08b7bd7a8943b8e8e8a3c435e308e6bdf432ea98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_25.txt @@ -0,0 +1,3 @@ +The premise is that the more "similar" +key query embedding vectors (in terms of cosine similarity) are to each other, the more likely they are assigned to +the same bucket. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_26.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d820d093da4eca9a6a695da0b8dda49635abf27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_26.txt @@ -0,0 +1,3 @@ +The accuracy of the LSH mechanism can be improved by increasing config.num_hashes or directly the argument +num_hashes of the forward function so that the output of the LSH self attention better approximates the output +of the "normal" full self attention. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_27.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..b88174ab46a473415cb10f5625d559441e01fc12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_27.txt @@ -0,0 +1,2 @@ +The buckets are then sorted and chunked into query key embedding vector chunks +each of length config.lsh_chunk_length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_28.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..c089ec5d111ee0c209d4d10bcae13121eec27a29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_28.txt @@ -0,0 +1,3 @@ +For each chunk, the query embedding vectors attend to its key vectors +(which are tied to themselves) and to the key embedding vectors of config.lsh_num_chunks_before previous +neighboring chunks and config.lsh_num_chunks_after following neighboring chunks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_29.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..7494571e609078babb547b9a8028c04aabdbd8f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_29.txt @@ -0,0 +1 @@ +For more information, see the original Paper or this great blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_30.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..88f438f3de1bc48bbf4008fa17983db601583451 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_30.txt @@ -0,0 +1,2 @@ +Note that config.num_buckets can also be factorized into a list \((n_{\text{buckets}}^1, +n_{\text{buckets}}^2)\). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_31.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc7066065d2dae2029f38db0f8c7b0bfa172267b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_31.txt @@ -0,0 +1,3 @@ +This way instead of assigning the query key embedding vectors to one of \((1,\ldots, +n_{\text{buckets}})\) they are assigned to one of \((1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, +1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)\). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_32.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2d2b26346913992cad41fd080f572c08c0ed331 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_32.txt @@ -0,0 +1,2 @@ +This is crucial for very long sequences to +save memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_33.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..886721473ecee01cdb9af1ffcd8cbcd63264e59f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_33.txt @@ -0,0 +1,2 @@ +When training a model from scratch, it is recommended to leave config.num_buckets=None, so that depending on the +sequence length a good value for num_buckets is calculated on the fly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_34.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..70e866201250f4ecd5bf898495294d7720a7fc50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_34.txt @@ -0,0 +1,2 @@ +This value will then automatically be +saved in the config and should be reused for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_35.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..276a752e618a34c6973d92b79ce393010c33fc92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_35.txt @@ -0,0 +1,3 @@ +Using LSH self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\(\mathcal{O}(n_s \times n_s)\) to \(\mathcal{O}(n_s \times \log(n_s))\), which usually represents the memory +and time bottleneck in a transformer model, with \(n_s\) being the sequence length. 
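As a concrete illustration of the knobs discussed above, the hedged sketch below leaves num_buckets unset so a good value is derived from the sequence length, and raises num_hashes at call time to tighten the LSH approximation. The checkpoint name and the inference details are assumptions based on the description above, not a verbatim library example.
python
# Hedged sketch of the LSH knobs described above.
import torch
from transformers import ReformerConfig, ReformerModel, ReformerTokenizer

# Training from scratch: leave num_buckets=None so a value is picked based on the sequence length.
scratch_model = ReformerModel(ReformerConfig(num_buckets=None))

# Inference: num_hashes can be passed per call to better approximate full self-attention.
tokenizer = ReformerTokenizer.from_pretrained("google/reformer-crime-and-punishment")  # assumed checkpoint
model = ReformerModel.from_pretrained("google/reformer-crime-and-punishment")
inputs = tokenizer("A long passage of text ...", return_tensors="pt")
with torch.no_grad():
    coarse = model(**inputs, num_hashes=1)  # faster, rougher LSH approximation
    finer = model(**inputs, num_hashes=8)   # closer to "normal" full self-attention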
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_36.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1373d2bbe7096fe4b2022030c246918ab71c24d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_36.txt @@ -0,0 +1,5 @@ +Local Self Attention +Local self attention is essentially a "normal" self attention layer with key, query and value projections, but is +chunked so that in each chunk of length config.local_chunk_length the query embedding vectors only attends to +the key embedding vectors in its chunk and to the key embedding vectors of config.local_num_chunks_before +previous neighboring chunks and config.local_num_chunks_after following neighboring chunks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_37.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3a31303c157e0e5ae5103bb81f31891298a708a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_37.txt @@ -0,0 +1,3 @@ +Using Local self attention, the memory and time complexity of the query-key matmul operation can be reduced from +\(\mathcal{O}(n_s \times n_s)\) to \(\mathcal{O}(n_s \times \log(n_s))\), which usually represents the memory +and time bottleneck in a transformer model, with \(n_s\) being the sequence length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_38.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..197c0ac2afc12d84d82aed81a63148e32d3e5bae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_38.txt @@ -0,0 +1,4 @@ +Training +During training, we must ensure that the sequence length is set to a value that can be divided by the least common +multiple of config.lsh_chunk_length and config.local_chunk_length and that the parameters of the Axial +Positional Encodings are correctly set as described above. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_39.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..39903aa7c0dc9cc26832b1b6802fa783b69b2816 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_39.txt @@ -0,0 +1,2 @@ +Reformer is very memory efficient so that the model can +easily be trained on sequences as long as 64000 tokens. 
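The divisibility requirement above (the training sequence length must be a multiple of the least common multiple of the LSH and local chunk lengths) is easy to check up front. The helper below is plain arithmetic; the two chunk-length values are example numbers rather than values read from any particular checkpoint.
python
# Hedged helper for the sequence-length constraint described above.
import math

lsh_chunk_length = 64        # example value; read the real one from the model config
local_chunk_length = 64      # example value
multiple = math.lcm(lsh_chunk_length, local_chunk_length)

seq_len = 16321              # arbitrary raw length
padded_len = math.ceil(seq_len / multiple) * multiple
print(multiple, padded_len)  # pad (or truncate) training inputs to padded_len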
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_reformer/chunk_40.txt b/chunked/content_aware_chunking/model_doc_reformer/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bf6926da075912381a2d1e707543382c53076f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_reformer/chunk_40.txt @@ -0,0 +1,33 @@ +For training, the [ReformerModelWithLMHead] should be used as follows: +python +input_ids = tokenizer.encode("This is a sentence from the training data", return_tensors="pt") +loss = model(input_ids, labels=input_ids)[0] +Resources + +Text classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide + +ReformerConfig +[[autodoc]] ReformerConfig +ReformerTokenizer +[[autodoc]] ReformerTokenizer + - save_vocabulary +ReformerTokenizerFast +[[autodoc]] ReformerTokenizerFast +ReformerModel +[[autodoc]] ReformerModel + - forward +ReformerModelWithLMHead +[[autodoc]] ReformerModelWithLMHead + - forward +ReformerForMaskedLM +[[autodoc]] ReformerForMaskedLM + - forward +ReformerForSequenceClassification +[[autodoc]] ReformerForSequenceClassification + - forward +ReformerForQuestionAnswering +[[autodoc]] ReformerForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1186b497d160247491ab894c2cd96f54ed4f04ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_10.txt @@ -0,0 +1 @@ +The RegNet design space provides simple and fast networks that work well across a wide range of flop regimes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ba22723ef750872618ace51a4e1d6196875321d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_11.txt @@ -0,0 +1 @@ +Under comparable training settings and flops, the RegNet models outperform the popular EfficientNet models while being up to 5x faster on GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..593cf6dd0ebb63de7187a7a416ba0ce3c13e685e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by Francesco. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..16f5765a10feaf30de10adb0b15a4014a3ecacf6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_13.txt @@ -0,0 +1,2 @@ +The TensorFlow version of the model +was contributed by sayakpaul and ariG23498. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b18b4eef9c4ac698bd0db611486030d7403ee439 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_15.txt @@ -0,0 +1,4 @@ +The huge 10B model from Self-supervised Pretraining of Visual Features in the Wild, +trained on one billion Instagram images, is available on the hub +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RegNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b310fad17de3efecd8b3de104271d0e58f1b544 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_16.txt @@ -0,0 +1 @@ +[RegNetForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_17.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_18.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
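Since the RegNet resources above point to the image-classification example script, a minimal hedged inference sketch is given here; the facebook/regnet-y-040 checkpoint and the local image path are assumptions.
python
# Hedged sketch of RegNet image classification with transformers.
import torch
from PIL import Image
from transformers import AutoImageProcessor, RegNetForImageClassification

processor = AutoImageProcessor.from_pretrained("facebook/regnet-y-040")  # assumed checkpoint
model = RegNetForImageClassification.from_pretrained("facebook/regnet-y-040")

image = Image.open("example.jpg")  # hypothetical local image
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])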
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_19.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f444c66ceba2dd6077e1b7d866002ab4de8d2fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_19.txt @@ -0,0 +1,23 @@ +RegNetConfig +[[autodoc]] RegNetConfig + +RegNetModel +[[autodoc]] RegNetModel + - forward +RegNetForImageClassification +[[autodoc]] RegNetForImageClassification + - forward + +TFRegNetModel +[[autodoc]] TFRegNetModel + - call +TFRegNetForImageClassification +[[autodoc]] TFRegNetForImageClassification + - call + +FlaxRegNetModel +[[autodoc]] FlaxRegNetModel + - call +FlaxRegNetForImageClassification +[[autodoc]] FlaxRegNetForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..29277478004da4750fe24a0cadc788db5d3b537a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_6.txt @@ -0,0 +1 @@ +The overall process is analogous to classic manual design of networks, but elevated to the design space level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..473b2ce98eb3cde1a700f67af03796f47a8d2c6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_7.txt @@ -0,0 +1 @@ +Using our methodology we explore the structure aspect of network design and arrive at a low-dimensional design space consisting of simple, regular networks that we call RegNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..50b7ca805d91cafd3e86ee0cf2fa9b91cc1de997 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_8.txt @@ -0,0 +1 @@ +The core insight of the RegNet parametrization is surprisingly simple: widths and depths of good networks can be explained by a quantized linear function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_regnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_regnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d6e41b2304a192c6f346bc714a015a8879b0409 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_regnet/chunk_9.txt @@ -0,0 +1 @@ +We analyze the RegNet design space and arrive at interesting findings that do not match the current practice of network design. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rembert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_rembert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..44fe1bfda486aa817f13ea80d9b59099b3a0bc80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rembert/chunk_10.txt @@ -0,0 +1,67 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +RemBertConfig +[[autodoc]] RemBertConfig +RemBertTokenizer +[[autodoc]] RemBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +RemBertTokenizerFast +[[autodoc]] RemBertTokenizerFast + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + +RemBertModel +[[autodoc]] RemBertModel + - forward +RemBertForCausalLM +[[autodoc]] RemBertForCausalLM + - forward +RemBertForMaskedLM +[[autodoc]] RemBertForMaskedLM + - forward +RemBertForSequenceClassification +[[autodoc]] RemBertForSequenceClassification + - forward +RemBertForMultipleChoice +[[autodoc]] RemBertForMultipleChoice + - forward +RemBertForTokenClassification +[[autodoc]] RemBertForTokenClassification + - forward +RemBertForQuestionAnswering +[[autodoc]] RemBertForQuestionAnswering + - forward + +TFRemBertModel +[[autodoc]] TFRemBertModel + - call +TFRemBertForMaskedLM +[[autodoc]] TFRemBertForMaskedLM + - call +TFRemBertForCausalLM +[[autodoc]] TFRemBertForCausalLM + - call +TFRemBertForSequenceClassification +[[autodoc]] TFRemBertForSequenceClassification + - call +TFRemBertForMultipleChoice +[[autodoc]] TFRemBertForMultipleChoice + - call +TFRemBertForTokenClassification +[[autodoc]] TFRemBertForTokenClassification + - call +TFRemBertForQuestionAnswering +[[autodoc]] TFRemBertForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rembert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_rembert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..6288aab182a8bd524abeaffa3ad7579e59a3479b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rembert/chunk_6.txt @@ -0,0 +1,3 @@ +Harnessing these +findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the +number of parameters at the fine-tuning stage. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rembert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_rembert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3c4751970a884481a18abb466761017f2dceaf9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rembert/chunk_7.txt @@ -0,0 +1,3 @@ +Usage tips +For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the +embedding layer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rembert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_rembert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b40fc06a5ead5064942b3a9c3eefa16a4fdd2f15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rembert/chunk_8.txt @@ -0,0 +1,2 @@ +The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input +embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rembert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_rembert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a847f39a171a3e22700812cc404bbd50b65681a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rembert/chunk_9.txt @@ -0,0 +1,2 @@ +The tokenizer is +also similar to the Albert one rather than the BERT one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f222fffb79f486f351b6471ca0ce88447908f4d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_10.txt @@ -0,0 +1 @@ +An ensemble of these residual nets achieves 3.57% error on the ImageNet test set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..717117fc69eb2487285b128fc4fce48d31eff799 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_11.txt @@ -0,0 +1 @@ +This result won the 1st place on the ILSVRC 2015 classification task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d906bf2afa099e697968038a60d356b4f5a84714 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_12.txt @@ -0,0 +1 @@ +We also present analysis on CIFAR-10 with 100 and 1000 layers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..060cc1eaa8e3e56117d729971c24ebbabce54fbd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_13.txt @@ -0,0 +1 @@ +The depth of representations is of central importance for many visual recognition tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb872ad6a75f097e47880c8be96f89e24aec2ce0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_14.txt @@ -0,0 +1 @@ +Solely due to our extremely deep representations, we obtain a 28% relative improvement on the COCO object detection dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..382d250291acf944ab0e66171974fbdc584ec024 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_15.txt @@ -0,0 +1 @@ +Deep residual nets are foundations of our submissions to ILSVRC & COCO 2015 competitions, where we also won the 1st places on the tasks of ImageNet detection, ImageNet localization, COCO detection, and COCO segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..38fcf1b15b714ebce8d66a9bf24f08ce802ca2cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_16.txt @@ -0,0 +1 @@ +The figure below illustrates the architecture of ResNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_17.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_18.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..593cf6dd0ebb63de7187a7a416ba0ce3c13e685e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_18.txt @@ -0,0 +1 @@ +This model was contributed by Francesco. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_19.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c9cd9b48231b2e76f2f2941303830dca79b6041 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_19.txt @@ -0,0 +1 @@ +The TensorFlow version of this model was added by amyeroberts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_20.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_20.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_21.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1cf1bfe6c01e46206034713c8985865a4bfd9a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_21.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ResNet. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_22.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..99b3aa4de23f6e325dbdc40b99ca3079f34a4a31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_22.txt @@ -0,0 +1 @@ +[ResNetForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_23.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_23.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_24.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_25.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..10dba868878a190885cafaf14be3f197873ef62c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_25.txt @@ -0,0 +1,23 @@ +ResNetConfig +[[autodoc]] ResNetConfig + +ResNetModel +[[autodoc]] ResNetModel + - forward +ResNetForImageClassification +[[autodoc]] ResNetForImageClassification + - forward + +TFResNetModel +[[autodoc]] TFResNetModel + - call +TFResNetForImageClassification +[[autodoc]] TFResNetForImageClassification + - call + +FlaxResNetModel +[[autodoc]] FlaxResNetModel + - call +FlaxResNetForImageClassification +[[autodoc]] FlaxResNetForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc483db1b7ccc21f9c0a43f6d8fdd7354167d10f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_6.txt @@ -0,0 +1 @@ +We present a residual learning framework to ease the training of networks that are substantially deeper than those used previously. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a098191b612d8bc7d109abe8dac8fe6893b4631e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_7.txt @@ -0,0 +1 @@ +We explicitly reformulate the layers as learning residual functions with reference to the layer inputs, instead of learning unreferenced functions. 
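To make "learning residual functions with reference to the layer inputs" concrete, here is a minimal PyTorch sketch of a residual block (illustrative only, not the layer code used by ResNetModel; the channel count and spatial size are arbitrary): the stacked layers learn F(x) and the block outputs F(x) + x, so the identity mapping is the easy default.

```python
import torch
from torch import nn

class ResidualBlock(nn.Module):
    """Minimal residual block: the layers learn F(x), the block returns F(x) + x."""
    def __init__(self, channels: int):
        super().__init__()
        self.f = nn.Sequential(
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
            nn.ReLU(),
            nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(channels),
        )
        self.relu = nn.ReLU()

    def forward(self, x):
        # The shortcut carries x unchanged; only the residual F(x) has to be learned.
        return self.relu(self.f(x) + x)

block = ResidualBlock(64)
out = block(torch.randn(1, 64, 56, 56))  # same shape as the input
```

If a block has nothing useful to add, it only has to drive F(x) toward zero, which is part of why very deep stacks of these blocks remain easy to optimize.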
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9849535d54fabcf13df3022248b5ceb4e29d17c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_8.txt @@ -0,0 +1 @@ +We provide comprehensive empirical evidence showing that these residual networks are easier to optimize, and can gain accuracy from considerably increased depth. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_resnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_resnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8833ae690d6b8fd4143d6187c46da463083a47a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_resnet/chunk_9.txt @@ -0,0 +1 @@ +On the ImageNet dataset we evaluate residual nets with a depth of up to 152 layers---8x deeper than VGG nets but still having lower complexity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_2.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b8083f6eef3375f354064f09a00d75e1be1c359 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_2.txt @@ -0,0 +1 @@ +You can do so by running the following command: pip install -U transformers==4.30.0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_3.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c7bb115e1f6f7bc08635def66ac7b2bac6eae2a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_3.txt @@ -0,0 +1,3 @@ +Overview +The RetriBERT model was proposed in the blog post Explain Anything Like I'm Five: A Model for Open Domain Long Form +Question Answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_4.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..524548b7021d8812a6861e273b4f6783ad0d7291 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_4.txt @@ -0,0 +1,2 @@ +RetriBERT is a small model that uses either a single or +pair of BERT encoders with lower-dimension projection for dense semantic indexing of text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4363d96966851370cf776c39cb31deff94684b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by yjernite. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..af1d89fc9045ce904ce6d962219c1462fa9c85a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_6.txt @@ -0,0 +1,2 @@ +Code to train and use the model can be +found here. 
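As a rough illustration of what "a BERT encoder with a lower-dimension projection for dense semantic indexing" looks like (a generic sketch, not the actual RetriBertModel API; the bert-base-uncased checkpoint, [CLS] pooling, and the 128-dimensional untrained projection are arbitrary choices made for the example):

```python
import torch
from torch import nn
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
encoder = AutoModel.from_pretrained("bert-base-uncased")
project = nn.Linear(encoder.config.hidden_size, 128)  # lower-dimension projection (128 is illustrative)

def embed(texts):
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
    with torch.no_grad():
        hidden = encoder(**batch).last_hidden_state[:, 0]  # [CLS] pooling
    return project(hidden)

query = embed(["how do planes fly?"])
docs = embed(["Lift is generated by the wings.", "Bread is made from flour."])
scores = query @ docs.T  # dense retrieval: higher dot product = more relevant
```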
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_retribert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_retribert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf5faa22d05db49cb948f5dbdc8284b176fcb88d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_retribert/chunk_7.txt @@ -0,0 +1,9 @@ +RetriBertConfig +[[autodoc]] RetriBertConfig +RetriBertTokenizer +[[autodoc]] RetriBertTokenizer +RetriBertTokenizerFast +[[autodoc]] RetriBertTokenizerFast +RetriBertModel +[[autodoc]] RetriBertModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca92edf2b8a34e949c588bdf6eec630c2f99bcd2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_10.txt @@ -0,0 +1,77 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +RobertaPreLayerNormConfig +[[autodoc]] RobertaPreLayerNormConfig + +RobertaPreLayerNormModel +[[autodoc]] RobertaPreLayerNormModel + - forward +RobertaPreLayerNormForCausalLM +[[autodoc]] RobertaPreLayerNormForCausalLM + - forward +RobertaPreLayerNormForMaskedLM +[[autodoc]] RobertaPreLayerNormForMaskedLM + - forward +RobertaPreLayerNormForSequenceClassification +[[autodoc]] RobertaPreLayerNormForSequenceClassification + - forward +RobertaPreLayerNormForMultipleChoice +[[autodoc]] RobertaPreLayerNormForMultipleChoice + - forward +RobertaPreLayerNormForTokenClassification +[[autodoc]] RobertaPreLayerNormForTokenClassification + - forward +RobertaPreLayerNormForQuestionAnswering +[[autodoc]] RobertaPreLayerNormForQuestionAnswering + - forward + +TFRobertaPreLayerNormModel +[[autodoc]] TFRobertaPreLayerNormModel + - call +TFRobertaPreLayerNormForCausalLM +[[autodoc]] TFRobertaPreLayerNormForCausalLM + - call +TFRobertaPreLayerNormForMaskedLM +[[autodoc]] TFRobertaPreLayerNormForMaskedLM + - call +TFRobertaPreLayerNormForSequenceClassification +[[autodoc]] TFRobertaPreLayerNormForSequenceClassification + - call +TFRobertaPreLayerNormForMultipleChoice +[[autodoc]] TFRobertaPreLayerNormForMultipleChoice + - call +TFRobertaPreLayerNormForTokenClassification +[[autodoc]] TFRobertaPreLayerNormForTokenClassification + - call +TFRobertaPreLayerNormForQuestionAnswering +[[autodoc]] TFRobertaPreLayerNormForQuestionAnswering + - call + +FlaxRobertaPreLayerNormModel +[[autodoc]] FlaxRobertaPreLayerNormModel + - call +FlaxRobertaPreLayerNormForCausalLM +[[autodoc]] FlaxRobertaPreLayerNormForCausalLM + - call +FlaxRobertaPreLayerNormForMaskedLM +[[autodoc]] FlaxRobertaPreLayerNormForMaskedLM + - call +FlaxRobertaPreLayerNormForSequenceClassification +[[autodoc]] FlaxRobertaPreLayerNormForSequenceClassification + - call +FlaxRobertaPreLayerNormForMultipleChoice +[[autodoc]] FlaxRobertaPreLayerNormForMultipleChoice + - call +FlaxRobertaPreLayerNormForTokenClassification +[[autodoc]] FlaxRobertaPreLayerNormForTokenClassification + - call +FlaxRobertaPreLayerNormForQuestionAnswering +[[autodoc]] FlaxRobertaPreLayerNormForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_4.txt 
b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa795397db1f3b56717e26fd1ee32a646e0eb885 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_4.txt @@ -0,0 +1 @@ +We also support fast mixed-precision training and inference on modern GPUs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_5.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..95086b1115954fd54db25a30ecd00613d425c69f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_5.txt @@ -0,0 +1 @@ +This model was contributed by andreasmaden. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_6.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..03d56e7d0e7a7fcc1641e6d2be6fd240e00f31aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_7.txt @@ -0,0 +1,3 @@ +Usage tips + +The implementation is the same as Roberta except instead of using Add and Norm it does Norm and Add. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..952a9ff0a211954c163fa3b86ea84182f1542b04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_8.txt @@ -0,0 +1 @@ +Add and Norm refers to the Addition and LayerNormalization as described in Attention Is All You Need. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..cde630c54dc2a54732ed1be540d34b61134aa5dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta-prelayernorm/chunk_9.txt @@ -0,0 +1 @@ +This is identical to using the --encoder-normalize-before flag in fairseq. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_10.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e26aae1b287b09bfaaddf592d7b633bc81e8d00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by julien-c. 
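To illustrate the RoBERTa-PreLayerNorm usage tip above (Norm then Add, instead of Add then Norm, i.e. the --encoder-normalize-before behaviour in fairseq), here is a purely schematic PyTorch sketch; the hidden size and the Linear stand-in for the attention/feed-forward sublayer are arbitrary, and this is not the library's actual encoder code.

```python
import torch
from torch import nn

norm = nn.LayerNorm(768)
sublayer = nn.Linear(768, 768)  # stands in for the attention or feed-forward block
x = torch.randn(2, 10, 768)

# Post-LayerNorm (original Transformer / RoBERTa): Add, then Norm
post_ln = norm(x + sublayer(x))

# Pre-LayerNorm (RoBERTa-PreLayerNorm): Norm, then Add
pre_ln = x + sublayer(norm(x))
```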
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_11.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_12.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a1a2127dd0885ee27565771bce7b7bab5a08667 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_12.txt @@ -0,0 +1,4 @@ +Usage tips + +This implementation is the same as [BertModel] with a tiny embeddings tweak as well as a setup + for Roberta pretrained models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_13.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3b796ae0c7a6656d2ab212386ee81909905907e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_13.txt @@ -0,0 +1,2 @@ +RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a + different pretraining scheme. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_14.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e6f417faf62e445acf44e4ea0e87866da88f4db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_14.txt @@ -0,0 +1 @@ +RoBERTa doesn't have token_type_ids, you don't need to indicate which token belongs to which segment. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_15.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a26e544d33af147fb79a3a48c780c9f87b0c6ec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_15.txt @@ -0,0 +1,10 @@ +Just + separate your segments with the separation token tokenizer.sep_token (or </s>) + +Same as BERT with better pretraining tricks: + +dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all +sentences are packed together to reach 512 tokens (so the sentences are in an order that may span several documents) +train with larger batches +use BPE with bytes as a subunit and not characters (because of unicode characters) +CamemBERT is a wrapper around RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_16.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1cdcc0271358371e87bb0fc2b7bd8ad6d9cc443 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_16.txt @@ -0,0 +1 @@ +Refer to this page for usage examples.
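A short sketch of the tokenizer tips above, using the public roberta-base checkpoint: a pair of segments is encoded without token_type_ids, and the two segments are simply joined with the separator token.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("roberta-base")
enc = tokenizer("How old are you?", "I'm 6 years old")

print(enc.keys())           # input_ids and attention_mask only -- no token_type_ids
print(tokenizer.sep_token)  # '</s>'
print(tokenizer.decode(enc["input_ids"]))
# roughly: <s>How old are you?</s></s>I'm 6 years old</s>
```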
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_17.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9a20574dfe3155ae6da0999c64b9ca64139d618 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_17.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_18.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_18.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_19.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_20.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8588342b9988e56493039a0f485b37006eed3792 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_20.txt @@ -0,0 +1 @@ +A blog on Getting Started with Sentiment Analysis on Twitter using RoBERTa and the Inference API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_21.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c4fe28aa7a139058a911798277174f320ab9a73 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_21.txt @@ -0,0 +1 @@ +A blog on Opinion Classification with Kili and Hugging Face AutoTrain using RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_22.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8be67ac0f66b33c32ae4e362da02018870124d92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_22.txt @@ -0,0 +1 @@ +A notebook on how to finetune RoBERTa for sentiment analysis. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_23.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..92af56049ca8adb5a3afa82436f5832963101843 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_23.txt @@ -0,0 +1,2 @@ +🌎 +[RobertaForSequenceClassification] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_24.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ce6efe929a562ebc6b479f69865c9e6d3ad8e2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_24.txt @@ -0,0 +1 @@ +[TFRobertaForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_25.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..66392dafa8b747ddeb32df6744688c649ba8ff2b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_25.txt @@ -0,0 +1 @@ +[FlaxRobertaForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_26.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..163be5244dc94759800f38eb84afc04b552ed190 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_26.txt @@ -0,0 +1,3 @@ +Text classification task guide + +[RobertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_27.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..615af20b1e59f9940c0c29bf95f801c2e95999c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_27.txt @@ -0,0 +1 @@ +[TFRobertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_28.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..95c2286182aa80542b14d605fb1852c16311cb82 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_28.txt @@ -0,0 +1 @@ +[FlaxRobertaForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_29.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_29.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_30.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..2063c5f458fa0288ab8b2e360003fca4a965eed0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_30.txt @@ -0,0 +1,3 @@ +Token classification task guide + +A blog on How to train a new language model from scratch using Transformers and Tokenizers with RoBERTa. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_31.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..afe433dd00fd43d86204a19b4177154f6e570892 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_31.txt @@ -0,0 +1 @@ +[RobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_32.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..dee0fa21aade8842bbd736b0d0e4fc74e405f91c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_32.txt @@ -0,0 +1 @@ +[TFRobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_33.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..535f811c8105413669722ed0d2779405e1534d93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_33.txt @@ -0,0 +1 @@ +[FlaxRobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_34.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_34.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_35.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e959f4bc6f96236053e2af949907c1eaf8dade --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_35.txt @@ -0,0 +1,3 @@ +Masked language modeling task guide + +A blog on Accelerated Inference with Optimum and Transformers Pipelines with RoBERTa for question answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_36.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..668cd9aab244d5e2736e78db17a5236cba033e8b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_36.txt @@ -0,0 +1 @@ +[RobertaForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_37.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..04adb5bdb8f299092bbdb69bc5d043f4d9a4cdaa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_37.txt @@ -0,0 +1 @@ +[TFRobertaForQuestionAnswering] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_38.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..eca6fc78e272eca77c14c5716ce1fad82deba050 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_38.txt @@ -0,0 +1 @@ +[FlaxRobertaForQuestionAnswering] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_39.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_39.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_40.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..1884781f421330503a9e55401ba1cec7604be6d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_40.txt @@ -0,0 +1,4 @@ +Question answering task guide + +Multiple choice +- [RobertaForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_41.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3b575438de5d3c5d4460c8e85e461da5a28443c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_41.txt @@ -0,0 +1 @@ +- [TFRobertaForMultipleChoice] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_42.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..15ecb9ee7c225b17f4199f0e13cc1c83b8dfc109 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_42.txt @@ -0,0 +1,78 @@ +- Multiple choice task guide +RobertaConfig +[[autodoc]] RobertaConfig +RobertaTokenizer +[[autodoc]] RobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +RobertaTokenizerFast +[[autodoc]] RobertaTokenizerFast + - build_inputs_with_special_tokens + +RobertaModel +[[autodoc]] RobertaModel + - forward +RobertaForCausalLM +[[autodoc]] RobertaForCausalLM + - forward +RobertaForMaskedLM +[[autodoc]] RobertaForMaskedLM + - forward +RobertaForSequenceClassification +[[autodoc]] RobertaForSequenceClassification + - forward +RobertaForMultipleChoice +[[autodoc]] RobertaForMultipleChoice + - forward +RobertaForTokenClassification +[[autodoc]] RobertaForTokenClassification + - forward +RobertaForQuestionAnswering +[[autodoc]] RobertaForQuestionAnswering + - forward + +TFRobertaModel +[[autodoc]] TFRobertaModel + - call +TFRobertaForCausalLM +[[autodoc]] TFRobertaForCausalLM + - call +TFRobertaForMaskedLM +[[autodoc]] TFRobertaForMaskedLM + - call +TFRobertaForSequenceClassification +[[autodoc]] TFRobertaForSequenceClassification + - call +TFRobertaForMultipleChoice +[[autodoc]] TFRobertaForMultipleChoice + - call +TFRobertaForTokenClassification +[[autodoc]] TFRobertaForTokenClassification + - call +TFRobertaForQuestionAnswering +[[autodoc]] TFRobertaForQuestionAnswering + - call + +FlaxRobertaModel +[[autodoc]] FlaxRobertaModel + - call +FlaxRobertaForCausalLM +[[autodoc]] FlaxRobertaForCausalLM + - call +FlaxRobertaForMaskedLM +[[autodoc]] FlaxRobertaForMaskedLM + - call +FlaxRobertaForSequenceClassification +[[autodoc]] FlaxRobertaForSequenceClassification + - call +FlaxRobertaForMultipleChoice +[[autodoc]] FlaxRobertaForMultipleChoice + - call +FlaxRobertaForTokenClassification +[[autodoc]] FlaxRobertaForTokenClassification + - call +FlaxRobertaForQuestionAnswering +[[autodoc]] FlaxRobertaForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roberta/chunk_9.txt b/chunked/content_aware_chunking/model_doc_roberta/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..52170cd44cb068fc225286b4027bd8984db30b7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roberta/chunk_9.txt @@ -0,0 +1 @@ +We release our models and code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e1876bbdc803177b4c59ce9982d2fb9fdf78389 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by weiweishi. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac06d2d040a659ce59f24153cfeb5ddaad0b20d6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_11.txt @@ -0,0 +1,42 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +RoCBertConfig +[[autodoc]] RoCBertConfig + - all +RoCBertTokenizer +[[autodoc]] RoCBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +RoCBertModel +[[autodoc]] RoCBertModel + - forward +RoCBertForPreTraining +[[autodoc]] RoCBertForPreTraining + - forward +RoCBertForCausalLM +[[autodoc]] RoCBertForCausalLM + - forward +RoCBertForMaskedLM +[[autodoc]] RoCBertForMaskedLM + - forward +RoCBertForSequenceClassification +[[autodoc]] transformers.RoCBertForSequenceClassification + - forward +RoCBertForMultipleChoice +[[autodoc]] transformers.RoCBertForMultipleChoice + - forward +RoCBertForTokenClassification +[[autodoc]] transformers.RoCBertForTokenClassification + - forward +RoCBertForQuestionAnswering +[[autodoc]] RoCBertForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_4.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..08b2f30f6aad1776cacca156362266044798e2d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_4.txt @@ -0,0 +1,3 @@ +In this work, we propose +ROCBERT: a pretrained Chinese Bert that is robust to various forms of adversarial attacks like word perturbation, +synonyms, typos, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_5.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..afd796920714654d2738eb4cfd930f367e240042 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_5.txt @@ -0,0 +1,2 @@ +It is pretrained with the contrastive learning objective which maximizes the label consistency +under different synthesized adversarial examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_6.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..1007eac6664161d79956235aa96dbdab1d9076c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_6.txt @@ -0,0 +1,2 @@ +The model takes as input multimodal information including the +semantic, phonetic and visual features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..850f8cb5e7a9c4bcc0124651844fe4ff4f39825b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_7.txt @@ -0,0 +1,2 @@ +We show all these features are important to the model robustness since the +attack can be performed in all the three forms. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7d71a9a07807986556dbc6f7622cdc98010fc12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_8.txt @@ -0,0 +1,2 @@ +Across 5 Chinese NLU tasks, ROCBERT outperforms strong baselines under +three blackbox adversarial algorithms without sacrificing the performance on clean testset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roc_bert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f312a794fe4597f669be9cc3aa4f8f99da32820d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roc_bert/chunk_9.txt @@ -0,0 +1,2 @@ +It also performs the best +in the toxic content detection task under human-made attacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..abde085d56c359d63d2eac7471ed32e2ee009a7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_10.txt @@ -0,0 +1,2 @@ +Usage tips +RoFormer is a BERT-like autoencoding model with rotary position embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..19918510fca04623aaef2bba291d696afc161de5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_11.txt @@ -0,0 +1,2 @@ +Rotary position embeddings have shown +improved performance on classification tasks with long texts. 
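As a rough sketch of the idea behind rotary position embeddings (an illustrative PyTorch snippet, not RoFormer's actual implementation; the 64-dimensional head size and sequence length are arbitrary): pairs of channels in the query and key vectors are rotated by an angle proportional to the token position, so the attention score between two tokens depends on their positions only through the relative offset.

```python
import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary(x, base=10000.0):
    # x: (seq_len, dim) with dim even; each (x1[t], x2[t]) pair is rotated by pos * theta_t
    seq_len, dim = x.shape
    inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
    angles = torch.outer(torch.arange(seq_len).float(), inv_freq)   # (seq_len, dim/2)
    cos, sin = torch.cos(angles).repeat(1, 2), torch.sin(angles).repeat(1, 2)
    return x * cos + rotate_half(x) * sin

q = apply_rotary(torch.randn(8, 64))
k = apply_rotary(torch.randn(8, 64))
scores = q @ k.T  # entry (i, j) depends on positions only through i - j
```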
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..59b169825154a654d0622ddeec1d3f40fa9e9b10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_12.txt @@ -0,0 +1,83 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +RoFormerConfig +[[autodoc]] RoFormerConfig +RoFormerTokenizer +[[autodoc]] RoFormerTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +RoFormerTokenizerFast +[[autodoc]] RoFormerTokenizerFast + - build_inputs_with_special_tokens + +RoFormerModel +[[autodoc]] RoFormerModel + - forward +RoFormerForCausalLM +[[autodoc]] RoFormerForCausalLM + - forward +RoFormerForMaskedLM +[[autodoc]] RoFormerForMaskedLM + - forward +RoFormerForSequenceClassification +[[autodoc]] RoFormerForSequenceClassification + - forward +RoFormerForMultipleChoice +[[autodoc]] RoFormerForMultipleChoice + - forward +RoFormerForTokenClassification +[[autodoc]] RoFormerForTokenClassification + - forward +RoFormerForQuestionAnswering +[[autodoc]] RoFormerForQuestionAnswering + - forward + +TFRoFormerModel +[[autodoc]] TFRoFormerModel + - call +TFRoFormerForMaskedLM +[[autodoc]] TFRoFormerForMaskedLM + - call +TFRoFormerForCausalLM +[[autodoc]] TFRoFormerForCausalLM + - call +TFRoFormerForSequenceClassification +[[autodoc]] TFRoFormerForSequenceClassification + - call +TFRoFormerForMultipleChoice +[[autodoc]] TFRoFormerForMultipleChoice + - call +TFRoFormerForTokenClassification +[[autodoc]] TFRoFormerForTokenClassification + - call +TFRoFormerForQuestionAnswering +[[autodoc]] TFRoFormerForQuestionAnswering + - call + +FlaxRoFormerModel +[[autodoc]] FlaxRoFormerModel + - call +FlaxRoFormerForMaskedLM +[[autodoc]] FlaxRoFormerForMaskedLM + - call +FlaxRoFormerForSequenceClassification +[[autodoc]] FlaxRoFormerForSequenceClassification + - call +FlaxRoFormerForMultipleChoice +[[autodoc]] FlaxRoFormerForMultipleChoice + - call +FlaxRoFormerForTokenClassification +[[autodoc]] FlaxRoFormerForTokenClassification + - call +FlaxRoFormerForQuestionAnswering +[[autodoc]] FlaxRoFormerForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_5.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..afaee2c8e849a833e5c980660e98a499eed02fc1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_5.txt @@ -0,0 +1,2 @@ +As a result, the enhanced +transformer with rotary position embedding, or RoFormer, achieves superior performance in tasks with long texts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..afbace967321047752cc40eeef678c0abbdfbb97 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_6.txt @@ -0,0 +1,2 @@ +We +release the theoretical analysis along with some preliminary experiment results on Chinese data. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8a5a0ae112af55efd1ec44e9c1a9979c465f652 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_7.txt @@ -0,0 +1,2 @@ +The undergoing +experiment for English benchmark will soon be updated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd070b93f93196dcff59ede6e2558ab455100b1a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by junnyu. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_roformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_roformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_roformer/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_10.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..894c899fefe3d9a7fe0f82eeff36d9ed10d225e8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_10.txt @@ -0,0 +1,3 @@ +\(W\) is a new vector that represents the position of the token and is given by +$$W_{0} = u \hbox{ and } W_{k} = (k-1)w \hbox{ for } k \geq 1$$ +with \(u\) and \(w\) learnable parameters called in the code time_first and time_decay respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_11.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c682ec0d8571f584d06d3f90932aa290ce3f0a8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_11.txt @@ -0,0 +1 @@ +The numerator and denominator can both be expressed recursively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_12.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4031f1693b5fb5594edd51ec29118042f600281 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_12.txt @@ -0,0 +1,9 @@ +Naming them \(N_{i}\) and \(D_{i}\) we have: +$$N_{i} = e^{u + K_{i}} V_{i} + \hat{N}_{i} \hbox{ where } \hat{N}_{i} = e^{K_{i-1}} V_{i-1} + e^{w + K_{i-2}} V_{i-2} + \cdots + e^{(i-2)w + K_{1}} V_{1}$$ +so \(\hat{N}_{i}\) (called numerator_state in the code) satisfies +$$\hat{N}_{0} = 0 \hbox{ and } \hat{N}_{j+1} = e^{K_{j}} V_{j} + e^{w} \hat{N}_{j}$$ +and +$$D_{i} = e^{u + K_{i}} + \hat{D}_{i} \hbox{ where } \hat{D}_{i} = e^{K_{i-1}} + e^{w + K_{i-2}} + \cdots + e^{(i-2)w + K_{1}}$$ +so \(\hat{D}_{i}\) (called denominator_state in the code) satisfies +$$\hat{D}_{0} = 0 \hbox{ and } \hat{D}_{j+1} = e^{K_{j}} + e^{w} \hat{D}_{j}$$ +The actual recurrent formulas used are a tiny bit more complex, as for numerical stability we don't want to compute exponentials of big numbers.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_13.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..20896eb10b9a03d780415aa0184d019f0a490bd0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_13.txt @@ -0,0 +1,3 @@ +Usually the softmax is not computed as is; instead, the numerator and denominator are both divided by the exponential of the maximum term: +$$\frac{e^{x_{i}}}{\sum_{j=1}^{n} e^{x_{j}}} = \frac{e^{x_{i} - M}}{\sum_{j=1}^{n} e^{x_{j} - M}}$$ +with \(M\) the maximum of all \(x_{j}\). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_14.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..39e4b7ae714dc783f1242749a6bb246a0f5c762b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_14.txt @@ -0,0 +1 @@ +So here on top of saving the numerator state (\(\hat{N}\)) and the denominator state (\(\hat{D}\)) we also keep track of the maximum of all terms encountered in the exponentials. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_15.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..af25e9b66c74c0c3857c695af6b191854eda03dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_15.txt @@ -0,0 +1,7 @@ +So we actually use +$$\tilde{N}_{i} = e^{-M_{i}} \hat{N}_{i} \hbox{ and } \tilde{D}_{i} = e^{-M_{i}} \hat{D}_{i}$$ +defined by the following recurrent formulas: +$$\tilde{N}_{0} = 0 \hbox{ and } \tilde{N}_{j+1} = e^{K_{j} - q} V_{j} + e^{w + M_{j} - q} \tilde{N}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ +and +$$\tilde{D}_{0} = 0 \hbox{ and } \tilde{D}_{j+1} = e^{K_{j} - q} + e^{w + M_{j} - q} \tilde{D}_{j} \hbox{ where } q = \max(K_{j}, w + M_{j})$$ +and \(M_{j+1} = q\). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_16.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..ded738c219cb2954483db155da58f407cacf04d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_16.txt @@ -0,0 +1,6 @@ +With those, we can then compute +$$N_{i} = e^{u + K_{i} - q} V_{i} + e^{M_{i} - q} \tilde{N}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ +and +$$D_{i} = e^{u + K_{i} - q} + e^{M_{i} - q} \tilde{D}_{i} \hbox{ where } q = \max(u + K_{i}, M_{i})$$ +which finally gives us +$$O_{i} = \sigma(R_{i}) \frac{N_{i}}{D_{i}}$$ \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_rwkv/chunk_9.txt b/chunked/content_aware_chunking/model_doc_rwkv/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1c3e6aa66501ec94a3c9a30072cb9b1f6396cde --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_rwkv/chunk_9.txt @@ -0,0 +1,3 @@ +In comparison, the RWKV attention is given by +$$O_{i} = \sigma(R_{i}) \frac{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}} V_{j}}{\sum_{j=1}^{i} e^{W_{i-j} + K_{j}}}$$ +where \(R\) is a new matrix called receptance by the author, \(K\) and \(V\) are still the key and value (\(\sigma\) here is the sigmoid function).
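The recurrence above translates directly into a short loop. Below is a minimal per-channel NumPy sketch that mirrors the numerically stable formulas (u is time_first, w is time_decay); it is meant as an illustration of the update order, not as the library's optimized kernel.

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def rwkv_attention(R, K, V, u, w):
    """R, K, V: arrays of shape (seq_len, channels); u = time_first, w = time_decay."""
    num = np.zeros_like(K[0])        # \tilde{N}
    den = np.zeros_like(K[0])        # \tilde{D}
    m = np.full_like(K[0], -np.inf)  # running maximum M
    out = np.zeros_like(V)
    for i in range(K.shape[0]):
        # Output for token i: scale both terms by e^{-q} with q = max(u + K_i, M_i)
        q = np.maximum(u + K[i], m)
        n_i = np.exp(u + K[i] - q) * V[i] + np.exp(m - q) * num
        d_i = np.exp(u + K[i] - q) + np.exp(m - q) * den
        out[i] = sigmoid(R[i]) * n_i / d_i
        # State update: q = max(K_i, w + M_i), then fold K_i, V_i into the state
        q = np.maximum(K[i], w + m)
        num = np.exp(K[i] - q) * V[i] + np.exp(w + m - q) * num
        den = np.exp(K[i] - q) + np.exp(w + m - q) * den
        m = q
    return out

T, C = 5, 8
R, K, V = np.random.randn(3, T, C)
out = rwkv_attention(R, K, V, u=np.zeros(C), w=-np.ones(C))
```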
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_10.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e801b6da6f52958fc50490ef4fa55e268b369b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_10.txt @@ -0,0 +1 @@ +However, at this time of writing this seems to be not supported according to the official repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_11.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c6a21557e911985e1003395f9ad05f82dde9a27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by ybelkada and ArthurZ. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_12.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_13.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05669afcc92ea56f9748b4fa7ae28bf0eb9a05ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_13.txt @@ -0,0 +1,21 @@ +Below is an example on how to run mask generation given an image and a 2D point: +thon +import torch +from PIL import Image +import requests +from transformers import SamModel, SamProcessor +device = "cuda" if torch.cuda.is_available() else "cpu" +model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device) +processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") +img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") +input_points = [[[450, 600]]] # 2D location of a window in the image +inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device) +with torch.no_grad(): + outputs = model(**inputs) +masks = processor.image_processor.post_process_masks( + outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu() +) +scores = outputs.iou_scores + +You can also process your own masks alongside the input images in the processor to be passed to the model. 
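As a small follow-up to the snippet above, the returned iou_scores can be used to keep only the best of the mask proposals for the prompt; the indexing below assumes the single-image, single-prompt setup of that example, with the usual three proposals per prompt.

```python
# masks is a list with one entry per image; masks[0] has shape (num_prompts, num_proposals, H, W)
best = scores[0, 0].argmax().item()   # index of the highest-scoring proposal
best_mask = masks[0][0, best]         # boolean tensor of shape (H, W)
print(f"kept proposal {best} with predicted IoU {scores[0, 0, best].item():.3f}")
```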
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_14.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec31bbc4af0a343f225bc057dc221dfd27c79a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_14.txt @@ -0,0 +1,23 @@ +thon +import torch +from PIL import Image +import requests +from transformers import SamModel, SamProcessor +device = "cuda" if torch.cuda.is_available() else "cpu" +model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device) +processor = SamProcessor.from_pretrained("facebook/sam-vit-huge") +img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") +mask_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png" +segmentation_map = Image.open(requests.get(mask_url, stream=True).raw).convert("RGB") +input_points = [[[450, 600]]] # 2D location of a window in the image +inputs = processor(raw_image, input_points=input_points, segmentation_maps=segmentation_map, return_tensors="pt").to(device) +with torch.no_grad(): + outputs = model(**inputs) +masks = processor.image_processor.post_process_masks( + outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu() +) +scores = outputs.iou_scores + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SAM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_15.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b25ef28a82bcdd6ad837db18a504bee24379b6fa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_15.txt @@ -0,0 +1 @@ +Demo notebook for using the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_16.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..770559d8aa158e90ae62b64fe79f156c1df2f583 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_16.txt @@ -0,0 +1 @@ +Demo notebook for using the automatic mask generation pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_17.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e78f216a01d4b990081e1823d72b4de6d86255 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_17.txt @@ -0,0 +1 @@ +Demo notebook for inference with MedSAM, a fine-tuned version of SAM on the medical domain. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_18.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..31a7d0b390ea79d2d3c86602f5cd0722acc428d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_18.txt @@ -0,0 +1,2 @@ +🌎 +Demo notebook for fine-tuning the model on custom data.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_19.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f02f8abb6eb45a6f89c206243abd7cee5860495 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_19.txt @@ -0,0 +1,4 @@ +🌎 + +SlimSAM +SlimSAM, a pruned version of SAM, was proposed in 0.1% Data Makes Segment Anything Slim by Zigeng Chen et al. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_20.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..21ae3df616eb73119e9066b222651719c638d83e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_20.txt @@ -0,0 +1 @@ +SlimSAM reduces the size of the SAM models considerably while maintaining the same performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_21.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..492c7fddd37a4931e7508c0e184a6033195f2ad2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_21.txt @@ -0,0 +1 @@ +Checkpoints can be found on the hub, and they can be used as a drop-in replacement of SAM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_22.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5f994d5b64b4418e01a256fed2e044eddf4a169 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_22.txt @@ -0,0 +1,18 @@ +SamConfig +[[autodoc]] SamConfig +SamVisionConfig +[[autodoc]] SamVisionConfig +SamMaskDecoderConfig +[[autodoc]] SamMaskDecoderConfig +SamPromptEncoderConfig +[[autodoc]] SamPromptEncoderConfig +SamProcessor +[[autodoc]] SamProcessor +SamImageProcessor +[[autodoc]] SamImageProcessor +SamModel +[[autodoc]] SamModel + - forward +TFSamModel +[[autodoc]] TFSamModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_7.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b7c2fb6e4ea9c05925feb3691cf2f25036f26a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_7.txt @@ -0,0 +1,3 @@ +Tips: + +The model predicts binary masks that states the presence or not of the object of interest given an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_8.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ec45c535698ff07f7e3224362058346f1128a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_8.txt @@ -0,0 +1,2 @@ +The model predicts much better results if input 2D points and/or input bounding boxes are provided +You can prompt multiple points for the same image, and predict a single mask. 
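To illustrate the multiple-points tip above, the input_points nesting from the earlier snippet can simply carry several coordinates for the same prompt (the second coordinate here is made up for illustration; processor, model, raw_image and device are reused from that snippet):

```python
# Two clicks on the same object, still a single prompt -> a single fused mask
input_points = [[[450, 600], [500, 650]]]
inputs = processor(raw_image, input_points=input_points, return_tensors="pt").to(device)
with torch.no_grad():
    outputs = model(**inputs)
```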
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sam/chunk_9.txt b/chunked/content_aware_chunking/model_doc_sam/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..12f7c24392732ca1f113646204e98699a219b658 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sam/chunk_9.txt @@ -0,0 +1,2 @@ +Fine-tuning the model is not supported yet +According to the paper, textual input should be also supported. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_15.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4482bb203db38f384121530fea07ad375f11148e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_15.txt @@ -0,0 +1 @@ +Critically, we evaluated SeamlessM4T on gender bias and added toxicity to assess translation safety. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_16.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bf722e8b892799870a5727f5c4955d05b0c1a08 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_16.txt @@ -0,0 +1,10 @@ +Finally, all contributions in this work are open-sourced and accessible at https://github.com/facebookresearch/seamless_communication +Usage +First, load the processor and a checkpoint of the model: +thon + +from transformers import AutoProcessor, SeamlessM4TModel +processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium") +model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium") + +You can seamlessly use this model on text or on audio, to generated either translated text or translated audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_17.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1dd6c9d77b17d0e34ef63539f8dcfe4137cd79dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_17.txt @@ -0,0 +1,14 @@ +Here is how to use the processor to process text and audio: +thon + +let's load an audio sample from an Arabic speech corpus +from datasets import load_dataset +dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True) +audio_sample = next(iter(dataset))["audio"] +now, process it +audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt") +now, process some English test as well +text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") + +Speech +[SeamlessM4TModel] can seamlessly generate text or speech with few or no changes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_18.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d774e8559f24c8306a9d0f11a5941170112d163 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_18.txt @@ -0,0 +1,7 @@ +Let's target Russian voice translation: +thon + +audio_array_from_text = model.generate(text_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze() +audio_array_from_audio = model.generate(audio_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze() + +With basically the same code, I've translated English text and Arabic speech to Russian speech samples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_19.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..80fb1afed5a1ca76947ce6f87cf709ea1ffe5853 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_19.txt @@ -0,0 +1,2 @@ +Text +Similarly, you can generate translated text from audio files or from text with the same model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_20.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..56f80b9e314882546562fa1657555741ab7ece37 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_20.txt @@ -0,0 +1 @@ +You only have to pass generate_speech=False to [SeamlessM4TModel.generate]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_21.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eb3a9c7e996da35bf3321c022a58b82e3e42d81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_21.txt @@ -0,0 +1 @@ +This time, let's translate to French. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_22.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..78f07d78d1e841e7a049edf162b3cb3c66c7edc9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_22.txt @@ -0,0 +1,11 @@ +thon + +from audio +output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False) +translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) +from text +output_tokens = model.generate(**text_inputs, tgt_lang="fra", generate_speech=False) +translated_text_from_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) + +Tips +1. 
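As a small follow-up to the speech snippet above, the generated Russian waveforms can be written to disk. This sketch assumes scipy is installed and that the checkpoint exposes its vocoder sampling rate as model.config.sampling_rate (16 kHz for SeamlessM4T):

```python
# Write the generated speech arrays from the snippet above to WAV files.
import scipy.io.wavfile

sample_rate = model.config.sampling_rate  # assumed to hold the vocoder's 16 kHz rate
scipy.io.wavfile.write("speech_from_text.wav", rate=sample_rate, data=audio_array_from_text)
scipy.io.wavfile.write("speech_from_audio.wav", rate=sample_rate, data=audio_array_from_audio)
```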
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_23.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..85fae9eddc4281828235e3451c8adc77c2831fef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_23.txt @@ -0,0 +1,2 @@ +Use dedicated models +[SeamlessM4TModel] is the top-level transformers model for generating speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_24.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..0911d1878d32d8f5c2a5e2312406d31ecc411e12 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_24.txt @@ -0,0 +1,7 @@ +For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task; the rest of the code is exactly the same: +thon + +from transformers import SeamlessM4TForSpeechToSpeech +model = SeamlessM4TForSpeechToSpeech.from_pretrained("facebook/hf-seamless-m4t-medium") + +Or you can replace the text-to-text generation snippet with the model dedicated to the T2TT task; you only have to remove generate_speech=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_25.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..62fae605e44a476d5df6bb339292a09ae4552cf1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_25.txt @@ -0,0 +1,6 @@ +thon + +from transformers import SeamlessM4TForTextToText +model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium") + +Feel free to try out [SeamlessM4TForSpeechToText] and [SeamlessM4TForTextToSpeech] as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_26.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_26.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_27.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..56736065f3c8a426c730748291dda94711c9aa21 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_27.txt @@ -0,0 +1,2 @@ +Change the speaker identity +You can change the speaker used for speech synthesis with the spkr_id argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_28.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c22b961d06805176f423998274cae0b76990419 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_28.txt @@ -0,0 +1 @@ +Some spkr_id values work better than others for some languages!
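A hedged sketch of tip 2, reusing text_inputs from the usage section above; the value 5 is an arbitrary example index into the model's bank of speaker embeddings:

```python
# Generate Russian speech with a different synthetic speaker (spkr_id chosen arbitrarily).
audio_array = model.generate(**text_inputs, tgt_lang="rus", spkr_id=5)[0].cpu().numpy().squeeze()
```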
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_29.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_29.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_30.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..65a7992e2072731efd6d3961b3f12ffd6e87eba3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_30.txt @@ -0,0 +1,2 @@ +Change the generation strategy +You can use different generation strategies for speech and text generation, e.g .generate(input_ids=input_ids, text_num_beams=4, speech_do_sample=True) which will successively perform beam-search decoding on the text model, and multinomial sampling on the speech model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_31.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_31.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_32.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..02f3318bde1fe6917ef72af8b31fc0646e9fbbfe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_32.txt @@ -0,0 +1,2 @@ +Generate speech and text at the same time +Use return_intermediate_token_ids=True with [SeamlessM4TModel] to return both speech and text ! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_33.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..5808971760a140bdbbec3a25f1cf719fb723f6cb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_33.txt @@ -0,0 +1,2 @@ +Model architecture +SeamlessM4T features a versatile architecture that smoothly handles the sequential generation of text and speech. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_34.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f3071ea832538fd18ad52caa69172e40b5e6a96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_34.txt @@ -0,0 +1 @@ +This setup comprises two sequence-to-sequence (seq2seq) models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_35.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fb00236da9288667c67813459b05716ab24595d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_35.txt @@ -0,0 +1 @@ +The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text. 
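Tying tips 3 and 4 above together, here is a sketch of a single generate call that mixes decoding strategies and returns both modalities. The keyword names follow the tips above, but the output field names (waveform, sequences) are an assumption about the returned generation output rather than something stated in these chunks:

```python
# Beam search on the text model, sampling on the speech model, and both outputs at once.
output = model.generate(
    **text_inputs,
    tgt_lang="rus",
    text_num_beams=4,               # beam-search decoding for the text model
    speech_do_sample=True,          # multinomial sampling for the speech model
    return_intermediate_token_ids=True,
)
waveform = output.waveform[0].cpu().numpy().squeeze()          # assumed field name
translated_text = processor.decode(output.sequences[0].tolist(),  # assumed field name
                                   skip_special_tokens=True)
```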
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_36.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd888f7dada97fdab9814fe7759c92751ad1e865 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_36.txt @@ -0,0 +1 @@ +Each modality has its own dedicated encoder with a unique architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_37.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..d748bf76829c121d3d24a11037f2fc6f3f1461c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_37.txt @@ -0,0 +1 @@ +Additionally, for speech output, a vocoder inspired by the HiFi-GAN architecture is placed on top of the second seq2seq model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_38.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..478d200dee1b27111454d3d3f57d9912bbd1a639 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_38.txt @@ -0,0 +1,3 @@ +Here's how the generation process works: + +Input text or speech is processed through its specific encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_39.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..6796c3f27e20ea271227741b886712885973e112 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_39.txt @@ -0,0 +1 @@ +A decoder creates text tokens in the desired language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_40.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc39979010f86ee60bca545fb19253c2b60c2a3c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_40.txt @@ -0,0 +1 @@ +If speech generation is required, the second seq2seq model, following a standard encoder-decoder structure, generates unit tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_41.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f2e3d5b1f18467e7ac5d2cd47e4806f8af126d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_41.txt @@ -0,0 +1 @@ +These unit tokens are then passed through the final vocoder to produce the actual speech. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_42.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..8684d438402dd4aeaf1427564107c702c5c11527 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_42.txt @@ -0,0 +1 @@ +This model was contributed by ylacombe. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_43.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_43.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_44.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..5189d4b912f472bfcbccffe15cb5406f3793702f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t/chunk_44.txt @@ -0,0 +1,43 @@ +SeamlessM4TModel +[[autodoc]] SeamlessM4TModel + - generate +SeamlessM4TForTextToSpeech +[[autodoc]] SeamlessM4TForTextToSpeech + - generate +SeamlessM4TForSpeechToSpeech +[[autodoc]] SeamlessM4TForSpeechToSpeech + - generate +SeamlessM4TForTextToText +[[autodoc]] transformers.SeamlessM4TForTextToText + - forward + - generate +SeamlessM4TForSpeechToText +[[autodoc]] transformers.SeamlessM4TForSpeechToText + - forward + - generate +SeamlessM4TConfig +[[autodoc]] SeamlessM4TConfig +SeamlessM4TTokenizer +[[autodoc]] SeamlessM4TTokenizer + - call + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +SeamlessM4TTokenizerFast +[[autodoc]] SeamlessM4TTokenizerFast + - call +SeamlessM4TFeatureExtractor +[[autodoc]] SeamlessM4TFeatureExtractor + - call +SeamlessM4TProcessor +[[autodoc]] SeamlessM4TProcessor + - call +SeamlessM4TCodeHifiGan +[[autodoc]] SeamlessM4TCodeHifiGan +SeamlessM4THifiGan +[[autodoc]] SeamlessM4THifiGan +SeamlessM4TTextToUnitModel +[[autodoc]] SeamlessM4TTextToUnitModel +SeamlessM4TTextToUnitForConditionalGeneration +[[autodoc]] SeamlessM4TTextToUnitForConditionalGeneration \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0ba7aeb1605f03b6847345ef2d63dc8f95d454a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_19.txt @@ -0,0 +1 @@ +Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e811b0b7db31747195bb8e4ba1f14917439de50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_20.txt @@ -0,0 +1 @@ +In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..71baa69596e548c4838be096247c94029fe210c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_21.txt @@ -0,0 +1 @@ +Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf782efecf339d278035a0195941e1d95b0fe2f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_22.txt @@ -0,0 +1,2 @@ +Usage +In the following example, we'll load an Arabic audio sample and an English text sample and convert them into Russian speech and French text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad8da0b3067cecf0716c4daf3abf10c11f43430c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_23.txt @@ -0,0 +1,8 @@ +First, load the processor and a checkpoint of the model: +thon + +from transformers import AutoProcessor, SeamlessM4Tv2Model +processor = AutoProcessor.from_pretrained("facebook/seamless-m4t-v2-large") +model = SeamlessM4Tv2Model.from_pretrained("facebook/seamless-m4t-v2-large") + +You can seamlessly use this model on text or on audio, to generated either translated text or translated audio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..498c0c1a9731b23c98c8b63acfe0191f9964c662 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_24.txt @@ -0,0 +1,14 @@ +Here is how to use the processor to process text and audio: +thon + +let's load an audio sample from an Arabic speech corpus +from datasets import load_dataset +dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True) +audio_sample = next(iter(dataset))["audio"] +now, process it +audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt") +now, process some English text as well +text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt") + +Speech +[SeamlessM4Tv2Model] can seamlessly generate text or speech with few or no changes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_25.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d774e8559f24c8306a9d0f11a5941170112d163 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_25.txt @@ -0,0 +1,7 @@ +Let's target Russian voice translation: +thon + +audio_array_from_text = model.generate(text_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze() +audio_array_from_audio = model.generate(audio_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze() + +With basically the same code, I've translated English text and Arabic speech to Russian speech samples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_26.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..80fb1afed5a1ca76947ce6f87cf709ea1ffe5853 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_26.txt @@ -0,0 +1,2 @@ +Text +Similarly, you can generate translated text from audio files or from text with the same model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_27.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..476d910cdd113f059d255a98d07bda438461bd62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_27.txt @@ -0,0 +1 @@ +You only have to pass generate_speech=False to [SeamlessM4Tv2Model.generate]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_28.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..6eb3a9c7e996da35bf3321c022a58b82e3e42d81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_28.txt @@ -0,0 +1 @@ +This time, let's translate to French. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_29.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..78f07d78d1e841e7a049edf162b3cb3c66c7edc9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_29.txt @@ -0,0 +1,11 @@ +thon + +from audio +output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False) +translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) +from text +output_tokens = model.generate(**text_inputs, tgt_lang="fra", generate_speech=False) +translated_text_from_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True) + +Tips +1. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_30.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfec89b4254a65481466d9dd4898217462be353d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_30.txt @@ -0,0 +1,2 @@ +Use dedicated models +[SeamlessM4Tv2Model] is the top-level transformers model for generating speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_31.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9020480510fd2c24b64c8de547de33a0945eb90 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_31.txt @@ -0,0 +1,7 @@ +For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task; the rest of the code is exactly the same: +thon + +from transformers import SeamlessM4Tv2ForSpeechToSpeech +model = SeamlessM4Tv2ForSpeechToSpeech.from_pretrained("facebook/seamless-m4t-v2-large") + +Or you can replace the text-to-text generation snippet with the model dedicated to the T2TT task; you only have to remove generate_speech=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_32.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..505f9b082d4c17d112bf0f950750ee3cafe31996 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_32.txt @@ -0,0 +1,6 @@ +thon + +from transformers import SeamlessM4Tv2ForTextToText +model = SeamlessM4Tv2ForTextToText.from_pretrained("facebook/seamless-m4t-v2-large") + +Feel free to try out [SeamlessM4Tv2ForSpeechToText] and [SeamlessM4Tv2ForTextToSpeech] as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_33.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_33.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_34.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..9651a28e76cf4b5e2abcd4fecde2299b61648eb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_34.txt @@ -0,0 +1,2 @@ +Change the speaker identity +You can change the speaker used for speech synthesis with the speaker_id argument. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_35.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..32a13dbda0a297cffa0a25a934be24da6cb007e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_35.txt @@ -0,0 +1 @@ +Some speaker_id values work better than others for some languages!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_36.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_36.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_37.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d5f0d2bc6cb8955473fbc77c8ffb1be81d7ec6c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_37.txt @@ -0,0 +1,2 @@ +Change the generation strategy +You can use different generation strategies for text generation, e.g .generate(input_ids=input_ids, text_num_beams=4, text_do_sample=True) which will perform multinomial beam-search decoding on the text model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_38.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fd931ba3e5c40e6fe1edf4e26d0d5a7013e7671 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_38.txt @@ -0,0 +1 @@ +Note that speech generation only supports greedy - by default - or multinomial sampling, which can be used with e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_39.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..773379dd7abf17268e0fd6caf057b3a2e1ce0ede --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_39.txt @@ -0,0 +1 @@ +.generate(, speech_do_sample=True, speech_temperature=0.6). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_40.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_40.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_41.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..cee842a1d7b35a5ede8e6174777c364b709738e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_41.txt @@ -0,0 +1,2 @@ +Generate speech and text at the same time +Use return_intermediate_token_ids=True with [SeamlessM4Tv2Model] to return both speech and text ! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_42.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..25e3726ed3e0a004e1444bd58d7d304cfc3e0f2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_42.txt @@ -0,0 +1,2 @@ +Model architecture +SeamlessM4T-v2 features a versatile architecture that smoothly handles the sequential generation of text and speech. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_43.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f3071ea832538fd18ad52caa69172e40b5e6a96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_43.txt @@ -0,0 +1 @@ +This setup comprises two sequence-to-sequence (seq2seq) models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_44.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fb00236da9288667c67813459b05716ab24595d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_44.txt @@ -0,0 +1 @@ +The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_45.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd888f7dada97fdab9814fe7759c92751ad1e865 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_45.txt @@ -0,0 +1 @@ +Each modality has its own dedicated encoder with a unique architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_46.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..d748bf76829c121d3d24a11037f2fc6f3f1461c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_46.txt @@ -0,0 +1 @@ +Additionally, for speech output, a vocoder inspired by the HiFi-GAN architecture is placed on top of the second seq2seq model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_47.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8d60c6559e95bc3974c7e3ea5db461890761e5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_47.txt @@ -0,0 +1,4 @@ +Difference with SeamlessM4T-v1 +The architecture of this new version differs from the first in a few aspects: +Improvements on the second-pass model +The second seq2seq model, named text-to-unit model, is now non-auto regressive, meaning that it computes units in a single forward pass. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_48.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..e838ba2c30ec4fbece6af4271d182c8f6c4df9d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_48.txt @@ -0,0 +1,2 @@ +This achievement is made possible by: +- the use of character-level embeddings, meaning that each character of the predicted translated text has its own embeddings, which are then used to predict the unit tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_49.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..40fcfa39a76bd31a13e6f94be1072b2ab6696e8d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_49.txt @@ -0,0 +1 @@ +- the use of an intermediate duration predictor, that predicts speech duration at the character-level on the predicted translated text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_50.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..f86b50164a5307c0a33d95f9f7432baa8151729e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_50.txt @@ -0,0 +1 @@ +- the use of a new text-to-unit decoder mixing convolutions and self-attention to handle longer context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_51.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..9817adb728805696165d88594fa725529859fe46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_51.txt @@ -0,0 +1,3 @@ +Difference in the speech encoder +The speech encoder, which is used during the first-pass generation process to predict the translated text, differs mainly from the previous speech encoder through these mechanisms: +- the use of chunked attention mask to prevent attention across chunks, ensuring that each position attends only to positions within its own chunk and a fixed number of previous chunks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_52.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4cc1b4fc1657a99c02e4b29db8b3b6ec33ca7aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_52.txt @@ -0,0 +1 @@ +- the use of relative position embeddings which only considers distance between sequence elements rather than absolute positions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_53.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..00798161c668c9d3a189b652ff43c13cfbbcc253 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_53.txt @@ -0,0 +1 @@ +Please refer to Self-Attentionwith Relative Position Representations (Shaw et al.) \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_54.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..709d1f80aed3e37424e9e1931308e81102b4e722 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_54.txt @@ -0,0 +1 @@ +for more details. 
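To make the chunked-attention idea above concrete, here is a small standalone illustration (not the library's internal implementation) of a mask in which each position may attend only to its own chunk and a fixed number of previous chunks:

```python
# Illustrative chunked attention mask: True means "attention allowed".
import torch

def chunked_attention_mask(seq_len: int, chunk_size: int, left_chunks: int) -> torch.Tensor:
    chunk_idx = torch.arange(seq_len) // chunk_size          # chunk index of every position
    diff = chunk_idx.unsqueeze(1) - chunk_idx.unsqueeze(0)   # chunk distance (query - key)
    return (diff >= 0) & (diff <= left_chunks)               # same chunk or up to `left_chunks` before

mask = chunked_attention_mask(seq_len=8, chunk_size=2, left_chunks=1)
# Row i marks the keys position i may attend to; e.g. positions 4 and 5 (chunk 2)
# can see chunks 1 and 2 but not chunk 0 or any later chunk.
```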
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_55.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e9aa0a5191b0869280861895a535c7f52493c15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_55.txt @@ -0,0 +1 @@ +- the use of a causal depth-wise convolution instead of a non-causal one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_56.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bb0e05d7047a0a276494e2c6c65eaf973a91445 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_56.txt @@ -0,0 +1,4 @@ +Generation process +Here's how the generation process works: + +Input text or speech is processed through its specific encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_57.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..6796c3f27e20ea271227741b886712885973e112 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_57.txt @@ -0,0 +1 @@ +A decoder creates text tokens in the desired language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_58.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaf048b1dae23beea30fb9e89efad91d7c8108b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_58.txt @@ -0,0 +1 @@ +If speech generation is required, the second seq2seq model, generates unit tokens in an non auto-regressive way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_59.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f2e3d5b1f18467e7ac5d2cd47e4806f8af126d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_59.txt @@ -0,0 +1 @@ +These unit tokens are then passed through the final vocoder to produce the actual speech. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_60.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..8684d438402dd4aeaf1427564107c702c5c11527 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_60.txt @@ -0,0 +1 @@ +This model was contributed by ylacombe. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_61.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_61.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_62.txt b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5ff56a7704baa1b9705b96d9e5b1726bd05fd67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_seamless_m4t_v2/chunk_62.txt @@ -0,0 +1,19 @@ +SeamlessM4Tv2Model +[[autodoc]] SeamlessM4Tv2Model + - generate +SeamlessM4Tv2ForTextToSpeech +[[autodoc]] SeamlessM4Tv2ForTextToSpeech + - generate +SeamlessM4Tv2ForSpeechToSpeech +[[autodoc]] SeamlessM4Tv2ForSpeechToSpeech + - generate +SeamlessM4Tv2ForTextToText +[[autodoc]] transformers.SeamlessM4Tv2ForTextToText + - forward + - generate +SeamlessM4Tv2ForSpeechToText +[[autodoc]] transformers.SeamlessM4Tv2ForSpeechToText + - forward + - generate +SeamlessM4Tv2Config +[[autodoc]] SeamlessM4Tv2Config \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7576199efb235e3e2d35e891fb842c77e65c34c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_14.txt @@ -0,0 +1,2 @@ +The TensorFlow version +of the model was contributed by sayakpaul. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_15.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bd11dca4787c7c094c1e848dcde63841fc09b6f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_16.txt @@ -0,0 +1,3 @@ +Usage tips + +SegFormer consists of a hierarchical Transformer encoder, and a lightweight all-MLP decoder head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cbca774de4fbfa54dcbbf50e6c95682e70ae293 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_17.txt @@ -0,0 +1,2 @@ +[SegformerModel] is the hierarchical Transformer encoder (which in the paper is also referred to + as Mix Transformer or MiT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..07073aee55d7e6b04052de1f48051c3138b44b24 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_18.txt @@ -0,0 +1,2 @@ +[SegformerForSemanticSegmentation] adds the all-MLP decoder head on + top to perform semantic segmentation of images. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1754de15cfc17d851457cd847ad8cba66e24424 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_19.txt @@ -0,0 +1,2 @@ +In addition, there's + [SegformerForImageClassification] which can be used to - you guessed it - classify images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..589345cf8efe3493403901a03f623f23a18f70c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_20.txt @@ -0,0 +1,2 @@ +The + authors of SegFormer first pre-trained the Transformer encoder on ImageNet-1k to classify images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b3a202c2ee6e94e207dfe6e6823e30c215d4a60 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_21.txt @@ -0,0 +1,2 @@ +Next, they throw + away the classification head, and replace it by the all-MLP decode head. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6de19e40c9766ae55dd81f5e724ec134788d2215 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_22.txt @@ -0,0 +1,2 @@ +Next, they fine-tune the model altogether on + ADE20K, Cityscapes and COCO-stuff, which are important benchmarks for semantic segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a9001666b2c1b031ceb7b2d2037169db5038a04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_23.txt @@ -0,0 +1,2 @@ +All checkpoints can be + found on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8def65be9cdf8e4ca2c499cfb0907740de4af4c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_24.txt @@ -0,0 +1,2 @@ +The quickest way to get started with SegFormer is by checking the example notebooks (which showcase both inference and + fine-tuning on custom data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7579c556657197b969d7010305821fbbf1d06a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_25.txt @@ -0,0 +1 @@ +One can also check out the blog post introducing SegFormer and illustrating how it can be fine-tuned on custom data. 
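For the image-classification use of the MiT encoder mentioned above, here is a short hedged sketch, assuming the nvidia/mit-b0 ImageNet-1k checkpoint and an arbitrary example image:

```python
# Classify an image with the SegFormer (MiT) encoder plus classification head.
import requests
import torch
from PIL import Image
from transformers import SegformerForImageClassification, SegformerImageProcessor

processor = SegformerImageProcessor.from_pretrained("nvidia/mit-b0")
model = SegformerForImageClassification.from_pretrained("nvidia/mit-b0")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])  # predicted ImageNet-1k class
```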
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_26.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..df4f3f8e71da4eea4728aa73d91c9417870fe774 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_26.txt @@ -0,0 +1 @@ +TensorFlow users should refer to this repository that shows off-the-shelf inference and fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_27.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee3b78603d8553f4f7af7c95157d2ed7a85a46ad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_27.txt @@ -0,0 +1,2 @@ +One can also check out this interactive demo on Hugging Face Spaces + to try out a SegFormer model on custom images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_28.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..de6ddd90d2e2bf520311c7970107827349acb517 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_28.txt @@ -0,0 +1 @@ +SegFormer works on any input size, as it pads the input to be divisible by config.patch_sizes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_29.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..f595570ece1a89bbd9e6b7e31e5c8b9b3d6f2f9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_29.txt @@ -0,0 +1,2 @@ +One can use [SegformerImageProcessor] to prepare images and corresponding segmentation maps + for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_30.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b171852aefae9d95b432549ebcb3059117a361e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_30.txt @@ -0,0 +1,2 @@ +Note that this image processor is fairly basic and does not include all data augmentations used in + the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_31.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..450915b5814bc77fbcb8f91fba5daa912be651d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_31.txt @@ -0,0 +1 @@ +The original preprocessing pipelines (for the ADE20k dataset for instance) can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_32.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..40231e0785edefcff58d288d2bbbb0c0d6c5af9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_32.txt @@ -0,0 +1,3 @@ +The most + important preprocessing step is that images and segmentation maps are randomly cropped and padded to the same size, + such as 512x512 or 640x640, after which they are normalized. 
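A minimal sketch of using [SegformerImageProcessor] to prepare an image/segmentation-map pair as described above; image and seg_map are placeholder names here, and heavier augmentations such as random cropping and flipping would be applied separately before this step (newer versions of the processor may name the flag do_reduce_labels):

```python
# Prepare a training pair: resized, normalized pixel values plus an integer label map.
from transformers import SegformerImageProcessor

image_processor = SegformerImageProcessor(reduce_labels=False)  # dataset already uses 0 as a real class
encoded = image_processor(images=image, segmentation_maps=seg_map, return_tensors="pt")
pixel_values = encoded["pixel_values"]  # shape (1, 3, H, W)
labels = encoded["labels"]              # shape (1, H, W), integer class ids
```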
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_33.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..fff936238375082d28764a495146c993b77e5614 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_33.txt @@ -0,0 +1,2 @@ +One additional thing to keep in mind is that one can initialize [SegformerImageProcessor] with + reduce_labels set to True or False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_34.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..48e5c9fcd881c802a4984bec1c10a7c086ee86e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_34.txt @@ -0,0 +1,2 @@ +In some datasets (like ADE20k), the 0 index is used in the annotated + segmentation maps for background. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_35.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d91884a4a3feb64662c34c2c8d671291fec33b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_35.txt @@ -0,0 +1 @@ +However, ADE20k doesn't include the "background" class in its 150 labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_36.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..84fba32b9844433ea48e7fa009bcb50d069b146f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_36.txt @@ -0,0 +1,2 @@ +Therefore, reduce_labels is used to reduce all labels by 1, and to make sure no loss is computed for the + background class (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_37.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..79c1d7a6dae4814a4932cf46f0b369c88fa7bad0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_37.txt @@ -0,0 +1,2 @@ +it replaces 0 in the annotated maps by 255, which is the ignore_index of the loss function + used by [SegformerForSemanticSegmentation]). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_38.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..90bdf3d5f4068d5317e6cd7378baae41e0964c76 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_38.txt @@ -0,0 +1,2 @@ +However, other datasets use the 0 index as + background class and include this class as part of all labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_39.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0ffa19ddd2cefaf7f691c83c731a916667374c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_39.txt @@ -0,0 +1,2 @@ +In that case, reduce_labels should be set to + False, as loss should also be computed for the background class. 
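The effect of reduce_labels can be illustrated with a tiny, made-up label map: every label is shifted down by one and the background index 0 becomes 255, the ignore_index of the loss:

```python
# Tiny numeric illustration of the reduce_labels behaviour (hypothetical 2x2 label map).
import numpy as np

seg_map = np.array([[0, 1],
                    [2, 0]], dtype=np.uint8)   # 0 = background in ADE20k-style maps

reduced = seg_map.astype(np.int16) - 1          # shift every label down by one
reduced[reduced == -1] = 255                    # background becomes the ignore_index
print(reduced)
# [[255   0]
#  [  1 255]]
```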
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_40.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..3951904f83fbf88f7c8255d4e8f3cdc59e3ea43b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_40.txt @@ -0,0 +1,2 @@ +As most models, SegFormer comes in different sizes, the details of which can be found in the table below + (taken from Table 7 of the original paper). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_41.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..73050240a6907552b33d4583ab241e1fc7d7ed22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_41.txt @@ -0,0 +1,9 @@ +| Model variant | Depths | Hidden sizes | Decoder hidden size | Params (M) | ImageNet-1k Top 1 | +| :---------------: | ------------- | ------------------- | :---------------------: | :------------: | :-------------------: | +| MiT-b0 | [2, 2, 2, 2] | [32, 64, 160, 256] | 256 | 3.7 | 70.5 | +| MiT-b1 | [2, 2, 2, 2] | [64, 128, 320, 512] | 256 | 14.0 | 78.7 | +| MiT-b2 | [3, 4, 6, 3] | [64, 128, 320, 512] | 768 | 25.4 | 81.6 | +| MiT-b3 | [3, 4, 18, 3] | [64, 128, 320, 512] | 768 | 45.2 | 83.1 | +| MiT-b4 | [3, 8, 27, 3] | [64, 128, 320, 512] | 768 | 62.6 | 83.6 | +| MiT-b5 | [3, 6, 40, 3] | [64, 128, 320, 512] | 768 | 82.0 | 83.8 | +Note that MiT in the above table refers to the Mix Transformer encoder backbone introduced in SegFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_42.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..790f63289f699c0864b709d4443518b857d9e056 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_42.txt @@ -0,0 +1,2 @@ +For +SegFormer's results on the segmentation datasets like ADE20k, refer to the paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_43.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fc12969ccb07e70eb75e6fbe65d630ff369b9c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_43.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SegFormer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_44.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..111f5aaf64491f6192289c0680acb02f7e768091 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_44.txt @@ -0,0 +1 @@ +[SegformerForImageClassification] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_45.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..03704541777611827632de0d2e6c48af72d3b800 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_45.txt @@ -0,0 +1,5 @@ +Image classification task guide + +Semantic segmentation: + +[SegformerForSemanticSegmentation] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_46.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6cb1aa90404a05404960807174874fa62a98aac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_46.txt @@ -0,0 +1 @@ +A blog on fine-tuning SegFormer on a custom dataset can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_47.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbaf28cf27f2730cf642e7df814e6d60e92869eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_47.txt @@ -0,0 +1 @@ +More demo notebooks on SegFormer (both inference + fine-tuning on a custom dataset) can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_48.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c2369a3fd0df32972c5dfab470fc99991ab3bee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_48.txt @@ -0,0 +1 @@ +[TFSegformerForSemanticSegmentation] is supported by this example notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_49.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..944de10e2fee305fa6fd5b548c032af31e3faa61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_49.txt @@ -0,0 +1,3 @@ +Semantic segmentation task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_50.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_50.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_segformer/chunk_51.txt b/chunked/content_aware_chunking/model_doc_segformer/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..209787b474c7ba3d7bab1e4025705d721c0aed4b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_segformer/chunk_51.txt @@ -0,0 +1,36 @@ +SegformerConfig +[[autodoc]] SegformerConfig +SegformerFeatureExtractor +[[autodoc]] SegformerFeatureExtractor + - call + - post_process_semantic_segmentation +SegformerImageProcessor +[[autodoc]] SegformerImageProcessor + - preprocess + - post_process_semantic_segmentation + +SegformerModel +[[autodoc]] SegformerModel + - forward +SegformerDecodeHead +[[autodoc]] SegformerDecodeHead + - forward +SegformerForImageClassification +[[autodoc]] SegformerForImageClassification + - forward +SegformerForSemanticSegmentation +[[autodoc]] SegformerForSemanticSegmentation + - forward + +TFSegformerDecodeHead +[[autodoc]] TFSegformerDecodeHead + - call +TFSegformerModel +[[autodoc]] TFSegformerModel + - call +TFSegformerForImageClassification +[[autodoc]] TFSegformerForImageClassification + - call +TFSegformerForSemanticSegmentation +[[autodoc]] TFSegformerForSemanticSegmentation + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew-d/chunk_4.txt b/chunked/content_aware_chunking/model_doc_sew-d/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..38c904b6b3ade08d98ca005d99b1ec9a1236dcec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew-d/chunk_4.txt @@ -0,0 +1,2 @@ +For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew-d/chunk_5.txt b/chunked/content_aware_chunking/model_doc_sew-d/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f88d2057272d24902a845b44b88e201601a7020 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew-d/chunk_5.txt @@ -0,0 +1,2 @@ +With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew-d/chunk_6.txt b/chunked/content_aware_chunking/model_doc_sew-d/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..12fefd17a7bece4dcfbec45db0f79359a0a9f033 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew-d/chunk_6.txt @@ -0,0 +1,4 @@ +This model was contributed by anton-l. +Usage tips + +SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew-d/chunk_7.txt b/chunked/content_aware_chunking/model_doc_sew-d/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..020befd832e2390ac986fc6d2a0db13cf2d20543 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew-d/chunk_7.txt @@ -0,0 +1,2 @@ +SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [Wav2Vec2CTCTokenizer]. 
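A minimal sketch of that CTC decoding pattern, assuming the asapp/sew-d-tiny-100k-ft-ls100h checkpoint; SEWForCTC (covered further below) is decoded with the exact same recipe:

import torch
from datasets import load_dataset
from transformers import AutoProcessor, SEWDForCTC

# assumed fine-tuned checkpoint; any SEW-D CTC checkpoint works the same way
processor = AutoProcessor.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")
model = SEWDForCTC.from_pretrained("asapp/sew-d-tiny-100k-ft-ls100h")

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
inputs = processor(ds[0]["audio"]["array"], sampling_rate=16_000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# greedy CTC decoding: per-frame argmax, then collapse repeats and blanks via the Wav2Vec2CTCTokenizer
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))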
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew-d/chunk_8.txt b/chunked/content_aware_chunking/model_doc_sew-d/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d8dedc745a081295538c0291dbd31f9f2e21e81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew-d/chunk_8.txt @@ -0,0 +1,16 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +SEWDConfig +[[autodoc]] SEWDConfig +SEWDModel +[[autodoc]] SEWDModel + - forward +SEWDForCTC +[[autodoc]] SEWDForCTC + - forward +SEWDForSequenceClassification +[[autodoc]] SEWDForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew/chunk_4.txt b/chunked/content_aware_chunking/model_doc_sew/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..38c904b6b3ade08d98ca005d99b1ec9a1236dcec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew/chunk_4.txt @@ -0,0 +1,2 @@ +For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x +inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew/chunk_5.txt b/chunked/content_aware_chunking/model_doc_sew/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f88d2057272d24902a845b44b88e201601a7020 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew/chunk_5.txt @@ -0,0 +1,2 @@ +With a similar inference +time, SEW reduces word error rate by 25-50% across different model sizes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew/chunk_6.txt b/chunked/content_aware_chunking/model_doc_sew/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d53e1019377eb67c1705285bdff91ec144367349 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew/chunk_6.txt @@ -0,0 +1,4 @@ +This model was contributed by anton-l. +Usage tips + +SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew/chunk_7.txt b/chunked/content_aware_chunking/model_doc_sew/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f4402d2a47f004838a400d3cbdae2e9478450a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew/chunk_7.txt @@ -0,0 +1,2 @@ +SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using + [Wav2Vec2CTCTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_sew/chunk_8.txt b/chunked/content_aware_chunking/model_doc_sew/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a087665ed0fc2ef0927951c6b8798ae932b14363 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_sew/chunk_8.txt @@ -0,0 +1,16 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +SEWConfig +[[autodoc]] SEWConfig +SEWModel +[[autodoc]] SEWModel + - forward +SEWForCTC +[[autodoc]] SEWForCTC + - forward +SEWForSequenceClassification +[[autodoc]] SEWForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0cf222b34d872f4501dcae74cf8bcba6e5b9f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_10.txt @@ -0,0 +1 @@ +The main difference is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..080a7382b653964ff8a0e3c543d677d1c8934647 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_11.txt @@ -0,0 +1 @@ +One needs to apply the sigmoid activation function to the logits, rather than the softmax. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_12.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7f874180cd7bfbd4668201f56df5ec1579253eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_12.txt @@ -0,0 +1 @@ +Training is not yet supported. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_13.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a344733fc09f71b831b2be4aeeb52a53689a8ae0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_13.txt @@ -0,0 +1 @@ +If you want to fine-tune SigLIP or train from scratch, refer to the loss function from OpenCLIP, which leverages various torch.distributed utilities. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_14.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..80624a5695f58d522972c9db3be22da36cef066c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_14.txt @@ -0,0 +1 @@ +When using the standalone [SiglipTokenizer] or [SiglipProcessor], make sure to pass padding="max_length" as that's how the model was trained. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_15.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6215b002e96f01bd43a13749ecf2bd112f659eab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_15.txt @@ -0,0 +1 @@ +SigLIP evaluation results compared to CLIP. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_16.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_16.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_17.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_17.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_18.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_18.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_19.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c9a0ac32cb4c866db3c8dfae36b4fef406a3347 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_19.txt @@ -0,0 +1,2 @@ +Usage example +There are 2 main ways to use SigLIP: either using the pipeline API, which abstracts away all the complexity for you, or by using the SiglipModel class yourself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_20.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8932f23de3055fdcfea24a591158b4fddd04a34b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_20.txt @@ -0,0 +1,42 @@ +Pipeline API +The pipeline allows to use the model in a few lines of code: +thon + +from transformers import pipeline +from PIL import Image +import requests +load pipe +image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224") +load image +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +inference +outputs = image_classifier(image, candidate_labels=["2 cats", "a plane", "a remote"]) +outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs] +print(outputs) +[{'score': 0.1979, 'label': '2 cats'}, {'score': 0.0, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}] + +Using the model yourself +If you want to do the pre- and postprocessing yourself, here's how to do that: +thon + +from PIL import Image +import requests +from transformers import AutoProcessor, AutoModel +import torch +model = AutoModel.from_pretrained("google/siglip-base-patch16-224") +processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224") +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +texts = ["a photo of 2 cats", "a photo of 2 dogs"] +important: we pass padding=max_length since the model was trained with this +inputs = processor(text=texts, images=image, 
padding="max_length", return_tensors="pt") +with torch.no_grad(): + outputs = model(**inputs) +logits_per_image = outputs.logits_per_image +probs = torch.sigmoid(logits_per_image) # these are the probabilities +print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") +31.9% that image 0 is 'a photo of 2 cats' + +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_21.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..09e0847710354f51063982f05e3badc3b0a0570f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_21.txt @@ -0,0 +1,2 @@ +Zero-shot image classification task guide +Demo notebooks for SigLIP can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_22.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..b31de82274061c061863f7d69a83c223187d5418 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_22.txt @@ -0,0 +1,3 @@ +🌎 + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_23.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_23.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_24.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..237981f9f6a936fa801cadd7578e0b2882eee379 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_24.txt @@ -0,0 +1,32 @@ +SiglipConfig +[[autodoc]] SiglipConfig + - from_text_vision_configs +SiglipTextConfig +[[autodoc]] SiglipTextConfig +SiglipVisionConfig +[[autodoc]] SiglipVisionConfig +SiglipTokenizer +[[autodoc]] SiglipTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +SiglipImageProcessor +[[autodoc]] SiglipImageProcessor + - preprocess +SiglipProcessor +[[autodoc]] SiglipProcessor +SiglipModel +[[autodoc]] SiglipModel + - forward + - get_text_features + - get_image_features +SiglipTextModel +[[autodoc]] SiglipTextModel + - forward +SiglipVisionModel +[[autodoc]] SiglipVisionModel + - forward +SiglipForImageClassification +[[autodoc]] SiglipForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_7.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f13ca362328168807573e6993ba0ff934800c2a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_7.txt @@ -0,0 +1 @@ +The disentanglement of the batch size from the loss further allows us to study the impact of examples vs pairs and negative to positive ratio. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a81ab5b2744ccdb990781cf86f3eb168bf2d1b61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_8.txt @@ -0,0 +1 @@ +Finally, we push the batch size to the extreme, up to one million, and find that the benefits of growing batch size quickly diminish, with a more reasonable batch size of 32k being sufficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_siglip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_siglip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b7e304f9c59bacb7e6bfb9ed4c3bb8109f7e4ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_siglip/chunk_9.txt @@ -0,0 +1,3 @@ +Usage tips + +Usage of SigLIP is similar to CLIP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_10.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb74d769b1d37c2e3c5683eec0815c8a71ce93d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_10.txt @@ -0,0 +1 @@ +Note that any pretrained Transformer-based speech model, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_11.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b65acfb0d39195f13e3403bfef61f364137b6b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_11.txt @@ -0,0 +1 @@ +Wav2Vec2, Hubert can serve as the encoder and both pretrained auto-encoding models, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_12.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9884768ca8473d038554cd6f2053ad009b4870b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_12.txt @@ -0,0 +1 @@ +BERT, pretrained causal language models, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_13.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..26811a35577f522a9e38509e0f4fec14461f14d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_13.txt @@ -0,0 +1 @@ +GPT2, as well as the pretrained decoder part of sequence-to-sequence models, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_14.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8df167b6ff1390dd23d0cf76ff607af04532696c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_14.txt @@ -0,0 +1 @@ +decoder of BART, can be used as the decoder. 
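As a quick illustration of that mix-and-match idea (the checkpoint pairing here is chosen only as an example, and the cross-attention weights are newly initialised):

from transformers import SpeechEncoderDecoderModel

# e.g. a Wav2Vec2 encoder with a GPT-2 decoder; cross-attention layers are added and randomly initialised
model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(
    "facebook/wav2vec2-base-960h", "openai-community/gpt2"
)
print(type(model.encoder).__name__, "+", type(model.decoder).__name__)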
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_15.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7d22e2c0189856696b62696c816fec60e2b622 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_15.txt @@ -0,0 +1 @@ +Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_16.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6907f2a22db25f6ac9597f55155489c5f30779c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_16.txt @@ -0,0 +1 @@ +Initializing [SpeechEncoderDecoderModel] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in the Warm-starting-encoder-decoder blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_17.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ac8e3e797a568e9f112383484208cca0e35e430 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_17.txt @@ -0,0 +1 @@ +To do so, the SpeechEncoderDecoderModel class provides a [SpeechEncoderDecoderModel.from_encoder_decoder_pretrained] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_18.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..99e7f5d91180899440afac178e2b10cba75a44d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_18.txt @@ -0,0 +1,8 @@ +thon + +from transformers import SpeechEncoderDecoderModel +model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained( + "facebook/hubert-large-ll60k", "google-bert/bert-base-uncased" + ) + +Loading an existing SpeechEncoderDecoderModel checkpoint and perform inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_19.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..65889d757fcbd2a74b4756e3429e192d91c84ed0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_19.txt @@ -0,0 +1 @@ +To load fine-tuned checkpoints of the SpeechEncoderDecoderModel class, [SpeechEncoderDecoderModel] provides the from_pretrained() method just like any other model architecture in Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_20.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..e55dc314fb376c66035398fd8c7b4ace794cafd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_20.txt @@ -0,0 +1 @@ +To perform inference, one uses the [generate] method, which allows to autoregressively generate text. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_21.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..f29d5c272c8190c62073648395190c4babe6a276 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_21.txt @@ -0,0 +1 @@ +This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_22.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..775e64433e4474dc4cd8af22d9f1fbbefc4a0ce5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_22.txt @@ -0,0 +1,16 @@ +thon + +from transformers import Wav2Vec2Processor, SpeechEncoderDecoderModel +from datasets import load_dataset +import torch +load a fine-tuned speech translation model and corresponding processor +model = SpeechEncoderDecoderModel.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") +processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-xls-r-300m-en-to-15") +let's perform inference on a piece of English speech (which we'll translate to German) +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +input_values = processor(ds[0]["audio"]["array"], return_tensors="pt").input_values +autoregressively generate transcription (uses greedy decoding by default) +generated_ids = model.generate(input_values) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_text) +Mr. Quilter ist der Apostel der Mittelschicht und wir freuen uns, sein Evangelium willkommen heißen zu können. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_23.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..40302549dce685798a398e41f94b15dac6da0881 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_23.txt @@ -0,0 +1,2 @@ +Training +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (speech, text) pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_24.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7705504b5857af434ac01b10d210b392786fdca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_24.txt @@ -0,0 +1,2 @@ +As you can see, only 2 inputs are required for the model in order to compute a loss: input_values (which are the +speech inputs) and labels (which are the input_ids of the encoded target sequence). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_25.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..8076d52008d9b1338b4a6904ec4cb4da89f86bcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_25.txt @@ -0,0 +1,31 @@ +thon + +from transformers import AutoTokenizer, AutoFeatureExtractor, SpeechEncoderDecoderModel +from datasets import load_dataset +encoder_id = "facebook/wav2vec2-base-960h" # acoustic model encoder +decoder_id = "google-bert/bert-base-uncased" # text decoder +feature_extractor = AutoFeatureExtractor.from_pretrained(encoder_id) +tokenizer = AutoTokenizer.from_pretrained(decoder_id) +Combine pre-trained encoder and pre-trained decoder to form a Seq2Seq model +model = SpeechEncoderDecoderModel.from_encoder_decoder_pretrained(encoder_id, decoder_id) +model.config.decoder_start_token_id = tokenizer.cls_token_id +model.config.pad_token_id = tokenizer.pad_token_id +load an audio input and pre-process (normalise mean/std to 0/1) +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values +load its corresponding transcription and tokenize to generate labels +labels = tokenizer(ds[0]["text"], return_tensors="pt").input_ids +the forward function automatically creates the correct decoder_input_ids +loss = model(input_values=input_values, labels=labels).loss +loss.backward() + +SpeechEncoderDecoderConfig +[[autodoc]] SpeechEncoderDecoderConfig +SpeechEncoderDecoderModel +[[autodoc]] SpeechEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained +FlaxSpeechEncoderDecoderModel +[[autodoc]] FlaxSpeechEncoderDecoderModel + - call + - from_encoder_decoder_pretrained \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_7.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e02c5909e52a95b6f87199ea8335eee5538a000 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_7.txt @@ -0,0 +1,2 @@ +In the following example, we show how to do this using the default [Wav2Vec2Model] configuration for the encoder +and the default [BertForCausalLM] configuration for the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_8.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e7037b0fdfa556500b7647d7294db57988509e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_8.txt @@ -0,0 +1,9 @@ +thon + +from transformers import BertConfig, Wav2Vec2Config, SpeechEncoderDecoderConfig, SpeechEncoderDecoderModel +config_encoder = Wav2Vec2Config() +config_decoder = BertConfig() +config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) +model = SpeechEncoderDecoderModel(config=config) + +Initialising SpeechEncoderDecoderModel from a pretrained encoder and a pretrained decoder. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_9.txt b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..616cf7ead93956b3244913b56032cad7056caae8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech-encoder-decoder/chunk_9.txt @@ -0,0 +1 @@ +[SpeechEncoderDecoderModel] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_10.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a094e8a048538b71f31a26cfa96c79a0e95fc23f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_10.txt @@ -0,0 +1,2 @@ +The [Speech2TextFeatureExtractor] class is responsible for extracting the log-mel filter-bank +features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_11.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..28e321d396f286fbfb8930e415a69dc1763c61c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_11.txt @@ -0,0 +1,3 @@ +The [Speech2TextProcessor] wraps [Speech2TextFeatureExtractor] and +[Speech2TextTokenizer] into a single instance to both extract the input features and decode the +predicted token ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_12.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05a45b8de7eee14d3ef1a102595bbf4a2c377559 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_12.txt @@ -0,0 +1,2 @@ +The feature extractor depends on torchaudio and the tokenizer depends on sentencepiece so be sure to +install those packages before running the examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_13.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f787ae51b35bcc773d556265a914be4321fedffa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_13.txt @@ -0,0 +1,2 @@ +You could either install those as extra speech dependencies with +pip install transformers"[speech, sentencepiece]" or install the packages separately with pip install torchaudio sentencepiece. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_14.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1402bffe3fb83ee6ac6787906093c2037fafd402 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_14.txt @@ -0,0 +1 @@ +Also torchaudio requires the development version of the libsndfile package which can be installed via a system package manager. 
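To make the feature-extraction step above concrete, a small sketch of what [Speech2TextFeatureExtractor] produces; one second of silence stands in for a real recording:

import numpy as np
from transformers import Speech2TextFeatureExtractor

feature_extractor = Speech2TextFeatureExtractor.from_pretrained("facebook/s2t-small-librispeech-asr")
waveform = np.zeros(16_000, dtype=np.float32)  # 1 second of 16 kHz audio, stand-in for real speech
features = feature_extractor(waveform, sampling_rate=16_000, return_tensors="pt")
print(features["input_features"].shape)  # (batch, frames, 80) log-mel filter-bank features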
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_15.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..25193dbf53bafcda1f34b585016cad233660ccb5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_15.txt @@ -0,0 +1,23 @@ +On Ubuntu it can +be installed as follows: apt install libsndfile1-dev + +ASR and Speech Translation + +thon + +import torch +from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration +from datasets import load_dataset +model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr") +processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr") +ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") +generated_ids = model.generate(inputs["input_features"], attention_mask=inputs["attention_mask"]) +transcription = processor.batch_decode(generated_ids, skip_special_tokens=True) +transcription +['mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'] + +Multilingual speech translation + +For multilingual speech translation models, eos_token_id is used as the decoder_start_token_id and + the target language id is forced as the first generated token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_16.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a26916cf0038930a781cd10dd3078a196cdb612c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_16.txt @@ -0,0 +1,2 @@ +To force the target language id as the first + generated token, pass the forced_bos_token_id parameter to the generate() method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_17.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddefb7b87b3bba84c71e4439db0e3337d7f16f04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_17.txt @@ -0,0 +1,3 @@ +The following + example shows how to transate English speech to French text using the facebook/s2t-medium-mustc-multilingual-st + checkpoint. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_18.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c1570e3f3ac4f388039b3b97e1222bbf408a291 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_18.txt @@ -0,0 +1,17 @@ +thon + +import torch +from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration +from datasets import load_dataset +model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") +processor = Speech2TextProcessor.from_pretrained("facebook/s2t-medium-mustc-multilingual-st") +ds = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation") +inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt") +generated_ids = model.generate( + inputs["input_features"], + attention_mask=inputs["attention_mask"], + forced_bos_token_id=processor.tokenizer.lang_code_to_id["fr"], + ) +translation = processor.batch_decode(generated_ids, skip_special_tokens=True) +translation +["(Vidéo) Si M. Kilder est l'apossible des classes moyennes, et nous sommes heureux d'être accueillis dans son évangile."] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_19.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8eeb462532625ba49b073d17def422a21d837f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_19.txt @@ -0,0 +1 @@ +See the model hub to look for Speech2Text checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_20.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb61e3fbe815f3e8b86c05f30a806f106fca653d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_20.txt @@ -0,0 +1,32 @@ +Speech2TextConfig +[[autodoc]] Speech2TextConfig +Speech2TextTokenizer +[[autodoc]] Speech2TextTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +Speech2TextFeatureExtractor +[[autodoc]] Speech2TextFeatureExtractor + - call +Speech2TextProcessor +[[autodoc]] Speech2TextProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode + +Speech2TextModel +[[autodoc]] Speech2TextModel + - forward +Speech2TextForConditionalGeneration +[[autodoc]] Speech2TextForConditionalGeneration + - forward + +TFSpeech2TextModel +[[autodoc]] TFSpeech2TextModel + - call +TFSpeech2TextForConditionalGeneration +[[autodoc]] TFSpeech2TextForConditionalGeneration + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_9.txt b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b2a8fa70ce9958615c5ef45ceb44d00fbb1071c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text/chunk_9.txt @@ -0,0 +1,2 @@ +The +generate() method can be used for inference. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a21085356904210ddee6723c4ee37703cf42ecb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_10.txt @@ -0,0 +1,2 @@ +The [Wav2Vec2FeatureExtractor] class is responsible for preprocessing the input speech and +[Speech2Text2Tokenizer] decodes the generated target tokens to the target string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e4156d914fdfa6fd60dbe0af0db959c55643f0a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_11.txt @@ -0,0 +1,4 @@ +The +[Speech2Text2Processor] wraps [Wav2Vec2FeatureExtractor] and +[Speech2Text2Tokenizer] into a single instance to both extract the input features and decode the +predicted token ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bdb15d2e8a32053e74dec7eb899877c9a132b0d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_12.txt @@ -0,0 +1,36 @@ +Step-by-step Speech Translation + +thon + +import torch +from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel +from datasets import load_dataset +import soundfile as sf +model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de") +processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de") +def map_to_array(batch): + speech, _ = sf.read(batch["file"]) + batch["speech"] = speech + return batch +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +ds = ds.map(map_to_array) +inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt") +generated_ids = model.generate(inputs=inputs["input_values"], attention_mask=inputs["attention_mask"]) +transcription = processor.batch_decode(generated_ids) + +Speech Translation via Pipelines + +The automatic speech recognition pipeline can also be used to translate speech in just a couple lines of code +thon + +from datasets import load_dataset +from transformers import pipeline +librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +asr = pipeline( + "automatic-speech-recognition", + model="facebook/s2t-wav2vec2-large-en-de", + feature_extractor="facebook/s2t-wav2vec2-large-en-de", + ) +translation_de = asr(librispeech_en[0]["file"]) + +See model hub to look for Speech2Text2 checkpoints. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6af8e085b5ffd2d58388c662960244fad40209e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_13.txt @@ -0,0 +1,21 @@ +Resources + +Causal language modeling task guide + +Speech2Text2Config +[[autodoc]] Speech2Text2Config +Speech2TextTokenizer +[[autodoc]] Speech2Text2Tokenizer + - batch_decode + - decode + - save_vocabulary +Speech2Text2Processor +[[autodoc]] Speech2Text2Processor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode +Speech2Text2ForCausalLM +[[autodoc]] Speech2Text2ForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_4.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_4.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_5.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5606ee6065165d163f7f998c026f5dd22a10bad8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_5.txt @@ -0,0 +1,3 @@ +Usage tips + +Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75b68cd7e523a4fda0caab4fc0185926068f26e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_6.txt @@ -0,0 +1,2 @@ +For more information, see + the official models . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..6590252cc57049b3439a25caf68cbe665f8d091d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_7.txt @@ -0,0 +1 @@ +Speech2Text2 is always used within the SpeechEncoderDecoder framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..759aac3bd50b7d45e5eaae16dc55e4c75bcf46a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_8.txt @@ -0,0 +1 @@ +Speech2Text2's tokenizer is based on fastBPE. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..385241add2cf0e8a99600ef6e2b156574c6d3980 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speech_to_text_2/chunk_9.txt @@ -0,0 +1,4 @@ +Inference +Speech2Text2's [SpeechEncoderDecoderModel] model accepts raw waveform input values from speech and +makes use of [~generation.GenerationMixin.generate] to translate the input speech +autoregressively to the target language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speecht5/chunk_6.txt b/chunked/content_aware_chunking/model_doc_speecht5/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c99914b443eecc52d655a4f54599ca6af01b0a0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speecht5/chunk_6.txt @@ -0,0 +1 @@ +Extensive evaluations show the superiority of the proposed SpeechT5 framework on a wide variety of spoken language processing tasks, including automatic speech recognition, speech synthesis, speech translation, voice conversion, speech enhancement, and speaker identification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speecht5/chunk_7.txt b/chunked/content_aware_chunking/model_doc_speecht5/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..23e753b0dd443084ac3559564454515cf96164e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speecht5/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by Matthijs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speecht5/chunk_8.txt b/chunked/content_aware_chunking/model_doc_speecht5/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speecht5/chunk_8.txt @@ -0,0 +1 @@ +The original code can be found here. 
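As a hedged example of the speech-synthesis side mentioned above, assuming the microsoft/speecht5_tts and microsoft/speecht5_hifigan checkpoints; the all-zero speaker embedding is only a placeholder for a real 512-dimensional x-vector:

import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello, my dog is cute.", return_tensors="pt")
speaker_embeddings = torch.zeros(1, 512)  # placeholder; use a real x-vector for intelligible speech
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
print(speech.shape)  # 1-D waveform at 16 kHz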
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_speecht5/chunk_9.txt b/chunked/content_aware_chunking/model_doc_speecht5/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b274c3c516601e5e0b96d4c5068ed0c8ef5d3871 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_speecht5/chunk_9.txt @@ -0,0 +1,38 @@ +SpeechT5Config +[[autodoc]] SpeechT5Config +SpeechT5HifiGanConfig +[[autodoc]] SpeechT5HifiGanConfig +SpeechT5Tokenizer +[[autodoc]] SpeechT5Tokenizer + - call + - save_vocabulary + - decode + - batch_decode +SpeechT5FeatureExtractor +[[autodoc]] SpeechT5FeatureExtractor + - call +SpeechT5Processor +[[autodoc]] SpeechT5Processor + - call + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode +SpeechT5Model +[[autodoc]] SpeechT5Model + - forward +SpeechT5ForSpeechToText +[[autodoc]] SpeechT5ForSpeechToText + - forward +SpeechT5ForTextToSpeech +[[autodoc]] SpeechT5ForTextToSpeech + - forward + - generate +SpeechT5ForSpeechToSpeech +[[autodoc]] SpeechT5ForSpeechToSpeech + - forward + - generate_speech +SpeechT5HifiGan +[[autodoc]] SpeechT5HifiGan + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_10.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b800bb9a95cfa5448c46f12f5cdb91f415c45696 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +Splinter was trained to predict answers spans conditioned on a special [QUESTION] token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_11.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6457ebbc85a9b4b222b0e886967e346e49893a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_11.txt @@ -0,0 +1,2 @@ +These tokens contextualize + to question representations which are used to predict the answers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_12.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7cd56ab30e88d64d9430e012d7cd03a75d09f192 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_12.txt @@ -0,0 +1,2 @@ +This layer is called QASS, and is the default + behaviour in the [SplinterForQuestionAnswering] class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_13.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..894a27658a33c89bc1637572b90d31443182fc75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_13.txt @@ -0,0 +1,3 @@ +Therefore: +Use [SplinterTokenizer] (rather than [BertTokenizer]), as it already + contains this special token. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_14.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b3ff75ff582544b06658ba220c8fd3de9f64f26 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_14.txt @@ -0,0 +1,2 @@ +Also, its default behavior is to use this token when two sequences are given (for + example, in the run_qa.py script). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_15.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..72216a4d38bfd86924193ca4a913dda4bf70910d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_15.txt @@ -0,0 +1,2 @@ +If you plan on using Splinter outside run_qa.py, please keep in mind the question token - it might be important for + the success of your model, especially in a few-shot setting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_16.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0b9852da1199edd1f1e541984a4fab2e9912f38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_16.txt @@ -0,0 +1 @@ +Please note there are two different checkpoints for each size of Splinter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_17.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7da528df3497f98b174d6f550ee81a73826e3be7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_17.txt @@ -0,0 +1,3 @@ +Both are basically the same, except that + one also has the pretrained weights of the QASS layer (tau/splinter-base-qass and tau/splinter-large-qass) and one + doesn't (tau/splinter-base and tau/splinter-large). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_18.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef63a0ba3cd3332f6c1392acb2a098824af37e1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_18.txt @@ -0,0 +1,2 @@ +This is done to support randomly initializing this layer at + fine-tuning, as it is shown to yield better results for some cases in the paper. 
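A short extractive-QA sketch using the tau/splinter-base-qass checkpoint named above; the question and passage are illustrative, and the tokenizer inserts the [QUESTION] token for sequence pairs automatically, as described:

import torch
from transformers import SplinterTokenizer, SplinterForQuestionAnswering

tokenizer = SplinterTokenizer.from_pretrained("tau/splinter-base-qass")
model = SplinterForQuestionAnswering.from_pretrained("tau/splinter-base-qass")

question = "Who was the first person to walk on the moon?"
context = "Neil Armstrong was the first person to walk on the moon in 1969."

inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

start = outputs.start_logits.argmax()
end = outputs.end_logits.argmax()
print(tokenizer.decode(inputs["input_ids"][0, start : end + 1]))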
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_19.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..b97b009ae2a0b8f97bc807f04778a3ce7b374f22 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_19.txt @@ -0,0 +1,23 @@ +Resources + +Question answering task guide + +SplinterConfig +[[autodoc]] SplinterConfig +SplinterTokenizer +[[autodoc]] SplinterTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +SplinterTokenizerFast +[[autodoc]] SplinterTokenizerFast +SplinterModel +[[autodoc]] SplinterModel + - forward +SplinterForQuestionAnswering +[[autodoc]] SplinterForQuestionAnswering + - forward +SplinterForPreTraining +[[autodoc]] SplinterForPreTraining + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_splinter/chunk_9.txt b/chunked/content_aware_chunking/model_doc_splinter/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_splinter/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1945f904d30e0ac1991d648e77fc7c4b5013fd86 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_10.txt @@ -0,0 +1,4 @@ +We demonstrate how to replace several operations in +self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called +SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test +set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ee00503ca655ce32806be3e29ea4cae5ac230e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_11.txt @@ -0,0 +1 @@ +The SqueezeBERT code will be released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f413ba54caf99cdc77b0636d822b62a74949ae1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by forresti. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d660e68f2dba47d9b87e5986d0a1cdefe8c2310f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_13.txt @@ -0,0 +1,4 @@ +Usage tips + +SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right + rather than the left. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f44d6208c4f4d6aecd2d3c38aaa654da3ab55a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_14.txt @@ -0,0 +1 @@ +SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..efd1565fd3e336f6554fa6ce51a152104f9a29b6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_15.txt @@ -0,0 +1,2 @@ +It is therefore + efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ca5d327cd6422d456ce83191a8376864bdb0c4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_16.txt @@ -0,0 +1,2 @@ +Models trained + with a causal language modeling (CLM) objective are better in that regard. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be80b86ad6d5bff2e3453c18178b4ce27b4df211 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_17.txt @@ -0,0 +1,2 @@ +For best results when finetuning on sequence classification tasks, it is recommended to start with the + squeezebert/squeezebert-mnli-headless checkpoint. 
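A minimal fine-tuning-style sketch following that recommendation; the classification head on the headless checkpoint is newly initialised, so the logits are meaningless until training, and the right-side padding matches the tip above:

from transformers import AutoTokenizer, SqueezeBertForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("squeezebert/squeezebert-mnli-headless")
model = SqueezeBertForSequenceClassification.from_pretrained(
    "squeezebert/squeezebert-mnli-headless", num_labels=2  # e.g. a binary task; the head is randomly initialised
)

# the tokenizer pads on the right by default, as advised for absolute position embeddings
inputs = tokenizer("SqueezeBERT runs well on mobile hardware.", padding="max_length", max_length=64, return_tensors="pt")
outputs = model(**inputs)
print(outputs.logits.shape)  # (1, 2)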
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b833f759337ca27718e806306b345a5ba5ad22b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_18.txt @@ -0,0 +1,30 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +SqueezeBertConfig +[[autodoc]] SqueezeBertConfig +SqueezeBertTokenizer +[[autodoc]] SqueezeBertTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +SqueezeBertTokenizerFast +[[autodoc]] SqueezeBertTokenizerFast +SqueezeBertModel +[[autodoc]] SqueezeBertModel +SqueezeBertForMaskedLM +[[autodoc]] SqueezeBertForMaskedLM +SqueezeBertForSequenceClassification +[[autodoc]] SqueezeBertForSequenceClassification +SqueezeBertForMultipleChoice +[[autodoc]] SqueezeBertForMultipleChoice +SqueezeBertForTokenClassification +[[autodoc]] SqueezeBertForTokenClassification +SqueezeBertForQuestionAnswering +[[autodoc]] SqueezeBertForQuestionAnswering \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_7.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ece40f4884e1ccc4ae87d0a497e0a1d24f1cbe9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_7.txt @@ -0,0 +1,2 @@ +In particular, we +consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_8.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0008ac81d9474a67ed0625e596d3b2731309d880 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_8.txt @@ -0,0 +1,3 @@ +However, today's +highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with +BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_squeezebert/chunk_9.txt b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..9160a834f5eba85d01efefd39b022b0ef7417fad --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_squeezebert/chunk_9.txt @@ -0,0 +1,3 @@ +In this work, we observe that methods +such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these +techniques have not been adopted by NLP neural network designers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e8672e814e2834a3361f2df38efcf7b88820449 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_10.txt @@ -0,0 +1 @@ +torch.bfloat16). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_11.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..470f2b119d467798ef723197e8e06d86b7ed0d5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_11.txt @@ -0,0 +1,26 @@ +Now, to run the model with Flash Attention 2, refer to the snippet below: +python + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") +model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2") +model.to(device) +model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device) +generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) +responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +responses +['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to'] + +StableLmConfig +[[autodoc]] StableLmConfig +StableLmModel +[[autodoc]] StableLmModel + - forward +StableLmForCausalLM +[[autodoc]] StableLmForCausalLM + - forward +StableLmForSequenceClassification +[[autodoc]] StableLmForSequenceClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_4.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8b28825f027706056e2891f0d4304ba1d5faa8b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_4.txt @@ -0,0 +1,3 @@ +Usage Tips + +The architecture is similar to LLaMA but with RoPE applied to 25% of head embedding dimensions, LayerNorm instead of RMSNorm, and optional QKV bias terms. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_5.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..285fe0d3c9f35eef29191948dfcf5a5e53fa677c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_5.txt @@ -0,0 +1 @@ +StableLM 3B 4E1T-based models use the same tokenizer as [GPTNeoXTokenizerFast].
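The architectural differences listed in the usage tips above (partial RoPE, LayerNorm, optional QKV bias) are surfaced through the model config. A small inspection sketch; the attribute names partial_rotary_factor and use_qkv_bias are assumptions that may differ across transformers versions, hence the defensive getattr:
from transformers import AutoConfig

config = AutoConfig.from_pretrained("stabilityai/stablelm-3b-4e1t")
# Fraction of head dimensions that receive rotary embeddings (the tips above say 25%).
print(getattr(config, "partial_rotary_factor", None))
# Whether the query/key/value projections carry bias terms.
print(getattr(config, "use_qkv_bias", None))
print(config.hidden_size, config.num_attention_heads)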
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb8927fb3b9f290724ad50e71324615e9954f137 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_6.txt @@ -0,0 +1,17 @@ +StableLM 3B 4E1T and StableLM Zephyr 3B can be found on the Hugging Face Hub. +The following code snippet demonstrates how to use StableLM 3B 4E1T for inference: +python + +from transformers import AutoModelForCausalLM, AutoTokenizer +device = "cuda" # the device to load the model onto +tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") +model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t") +model.to(device) +model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device) +generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) +responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) +responses +['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to'] + +Combining StableLM and Flash Attention 2 +First, make sure to install the latest version of Flash Attention v2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..400d5e2dd0ae7bc4143f0801579100a29a8a97dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_7.txt @@ -0,0 +1,2 @@ +pip install -U flash-attn --no-build-isolation +Also make sure that your hardware is compatible with Flash-Attention 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4b04cd89354466c3206845cd0a2485e0da3833d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_8.txt @@ -0,0 +1 @@ +Read more about it in the official documentation of the flash-attn repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_stablelm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_stablelm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6f867c10e3ad2beb776e09fc1bf5b06fcc997cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_stablelm/chunk_9.txt @@ -0,0 +1 @@ +Note: you must load your model in half-precision (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc043741b96f552a70515a7cb8128bff4cfa75b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_10.txt @@ -0,0 +1 @@ +Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed.
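Before switching on attn_implementation="flash_attention_2" as shown above, a quick hardware check can save a failed run. A minimal sketch, assuming that FlashAttention-2 targets GPUs of compute capability 8.0 (Ampere) or newer:
import torch

if not torch.cuda.is_available():
    print("No CUDA device found; Flash Attention 2 requires a GPU.")
else:
    major, minor = torch.cuda.get_device_capability()
    print(f"Detected compute capability {major}.{minor}")
    if major >= 8:
        print("Flash Attention 2 should be usable on this GPU.")
    else:
        print("Consider the default attention or SDPA instead.")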
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1768ad34ef39855f93a7379bc135cbd6c4abf38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_11.txt @@ -0,0 +1 @@ +Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1f5abf1b531dbc8d7b8453fa41953dbdcde0cd6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by shehan97. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3595d866a846f183df92d13a820a8969a9ea0f0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_14.txt @@ -0,0 +1,8 @@ +SwiftFormerConfig +[[autodoc]] SwiftFormerConfig +SwiftFormerModel +[[autodoc]] SwiftFormerModel + - forward +SwiftFormerForImageClassification +[[autodoc]] SwiftFormerForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4819702dadd858c4670192134f6dc2fa0e7ece7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_6.txt @@ -0,0 +1 @@ +Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f274955f47f1a67b0ed0c6a7d1b781680aa40e51 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_7.txt @@ -0,0 +1 @@ +In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. 
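To illustrate the idea described above (replacing the token-by-token similarity matrix with linear, element-wise operations), here is a conceptual PyTorch sketch of an additive-attention style mixer. It is illustrative only, not the SwiftFormer authors' implementation, and the module and parameter names are invented for the example:
import torch
import torch.nn as nn

class AdditiveAttentionSketch(nn.Module):
    # Linear-time token mixing: one scalar score per token instead of an N x N attention matrix.
    def __init__(self, dim):
        super().__init__()
        self.to_q = nn.Linear(dim, dim)
        self.to_k = nn.Linear(dim, dim)
        self.w_a = nn.Parameter(torch.randn(dim))   # learned scoring vector
        self.proj = nn.Linear(dim, dim)
        self.scale = dim ** -0.5

    def forward(self, x):                            # x: (batch, tokens, dim)
        q, k = self.to_q(x), self.to_k(x)
        scores = (q @ self.w_a) * self.scale         # (batch, tokens): one score per token
        alpha = scores.softmax(dim=-1).unsqueeze(-1)
        global_q = (alpha * q).sum(dim=1, keepdim=True)   # pooled global query, (batch, 1, dim)
        return self.proj(global_q * k) + q           # element-wise key interaction plus a linear layer

x = torch.randn(2, 196, 64)
print(AdditiveAttentionSketch(64)(x).shape)          # torch.Size([2, 196, 64])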
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6169bdffd882b61dbaf3ac764e40d841a0d7fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_8.txt @@ -0,0 +1 @@ +Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swiftformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..91a6ec6f7bd99607226229ba0e588bd15f98ea31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swiftformer/chunk_9.txt @@ -0,0 +1 @@ +Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_10.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_10.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_11.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe063d0f82bdd9b95cdc2cee77e4423709b2ab51 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by novice03. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_12.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..46ded1d1718fdbce746416981bf93a492a469325 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_12.txt @@ -0,0 +1 @@ +The Tensorflow version of this model was contributed by amyeroberts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_13.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_14.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f67ecae2bea70947821b4cdbacdf0926c1d3a18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_14.txt @@ -0,0 +1,3 @@ +Usage tips + +Swin pads the inputs supporting any input height and width (if divisible by 32). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_15.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..341da485da8111e94f53d40e294554ee69849a3a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_15.txt @@ -0,0 +1 @@ +Swin can be used as a backbone. 
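For the backbone use case mentioned above, a minimal sketch with the AutoBackbone API; the microsoft/swin-tiny-patch4-window7-224 checkpoint and the random tensor stand in for a real image pipeline:
import torch
from transformers import AutoBackbone

backbone = AutoBackbone.from_pretrained(
    "microsoft/swin-tiny-patch4-window7-224",
    out_features=["stage1", "stage2", "stage3", "stage4"],
)
pixel_values = torch.randn(1, 3, 224, 224)           # placeholder for processed image pixels
with torch.no_grad():
    outputs = backbone(pixel_values)
for name, feature_map in zip(backbone.out_features, outputs.feature_maps):
    print(name, tuple(feature_map.shape))            # multi-scale maps for a detection or segmentation head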
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_16.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc1f01024443959af1c42054e3541eb5ae75aef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_16.txt @@ -0,0 +1 @@ +When output_hidden_states = True, it will output both hidden_states and reshaped_hidden_states. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_17.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..51ab63c196d5049125a8c3d07124db966522ff4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_17.txt @@ -0,0 +1 @@ +The reshaped_hidden_states have a shape of (batch, num_channels, height, width) rather than (batch_size, sequence_length, num_channels). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_18.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2fae0e0c98bbc63a0e21ae0279a5cff1baccdc8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_18.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_19.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ecbbbe59a1a6f5f39ca0e36a907b7fef08407b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_19.txt @@ -0,0 +1 @@ +[SwinForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_20.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..17bd08e15991bdcae2d2061a1e74ee16ca3b7f47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_20.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Besides that: + +[SwinForMaskedImageModeling] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_21.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_22.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
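A short sketch of the output shapes described above, using the microsoft/swin-tiny-patch4-window7-224 checkpoint as an illustrative choice and a random tensor instead of a processed image:
import torch
from transformers import SwinModel

model = SwinModel.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)
print(outputs.hidden_states[0].shape)            # (batch_size, sequence_length, num_channels)
print(outputs.reshaped_hidden_states[0].shape)   # (batch, num_channels, height, width)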
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_23.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a56b8332e4bd9d1f422b25c3ed8d7fa48abaca3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_23.txt @@ -0,0 +1,22 @@ +SwinConfig +[[autodoc]] SwinConfig + +SwinModel +[[autodoc]] SwinModel + - forward +SwinForMaskedImageModeling +[[autodoc]] SwinForMaskedImageModeling + - forward +SwinForImageClassification +[[autodoc]] transformers.SwinForImageClassification + - forward + +TFSwinModel +[[autodoc]] TFSwinModel + - call +TFSwinForMaskedImageModeling +[[autodoc]] TFSwinForMaskedImageModeling + - call +TFSwinForImageClassification +[[autodoc]] transformers.TFSwinForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_7.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8d73e765063d418eb2df0b7da68dc05572764ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_7.txt @@ -0,0 +1,2 @@ +Its performance surpasses the previous state-of-the-art by a large margin of +2.7 box AP and ++2.6 mask AP on COCO, and +3.2 mIoU on ADE20K, demonstrating the potential of Transformer-based models as vision backbones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_8.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cd642d34565ab0258849cc4018366e5a8dacee6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_8.txt @@ -0,0 +1 @@ +The hierarchical design and the shifted window approach also prove beneficial for all-MLP architectures. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin/chunk_9.txt b/chunked/content_aware_chunking/model_doc_swin/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..77deec0734ff0f9b6a1ccb30621e4478278567fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin/chunk_9.txt @@ -0,0 +1 @@ +Swin Transformer architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_10.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7f8f0f75bd7d10f7461a990f2e0f14537f88240 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_10.txt @@ -0,0 +1 @@ +Swin2SR architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_11.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_12.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_13.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_13.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_14.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae76be8d9a5b2f287fd731ea16103f9c1f374701 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_14.txt @@ -0,0 +1,2 @@ +Resources +Demo notebooks for Swin2SR can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_15.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2d7d5cf09db19cd78c3774601543a014e1febe8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_15.txt @@ -0,0 +1 @@ +A demo Space for image super-resolution with SwinSR can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_16.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1632d741e1e2052e861e5e122825bceb51f7f52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_16.txt @@ -0,0 +1,11 @@ +Swin2SRImageProcessor +[[autodoc]] Swin2SRImageProcessor + - preprocess +Swin2SRConfig +[[autodoc]] Swin2SRConfig +Swin2SRModel +[[autodoc]] Swin2SRModel + - forward +Swin2SRForImageSuperResolution +[[autodoc]] Swin2SRForImageSuperResolution + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_7.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8876850ec29fcbf6908c1ef2609e945e623f221 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_7.txt @@ -0,0 +1 @@ +Using this method we can tackle the major issues in training transformer vision models, such as training instability, resolution gaps between pre-training and fine-tuning, and hunger on data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_8.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b91b7f09007c4abebb701ee0b796b2ac761281c3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_8.txt @@ -0,0 +1 @@ +We conduct experiments on three representative tasks: JPEG compression artifacts removal, image super-resolution (classical and lightweight), and compressed image super-resolution. 
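A hedged inference sketch for the super-resolution task described above. The caidas/swin2SR-classical-sr-x2-64 checkpoint is assumed as an illustrative 2x upscaler, and a random array stands in for a real low-resolution image:
import numpy as np
import torch
from transformers import Swin2SRImageProcessor, Swin2SRForImageSuperResolution

processor = Swin2SRImageProcessor()
model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-classical-sr-x2-64")

low_res = (np.random.rand(64, 64, 3) * 255).astype(np.uint8)   # placeholder image, (H, W, C)
inputs = processor(low_res, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.reconstruction.shape)   # roughly 2x the (padded) input resolution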
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swin2sr/chunk_9.txt b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bb0c532f537c4f013b988519a7c382d09fd1c3d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swin2sr/chunk_9.txt @@ -0,0 +1 @@ +Experimental results demonstrate that our method, Swin2SR, can improve the training convergence and performance of SwinIR, and is a top-5 solution at the "AIM 2022 Challenge on Super-Resolution of Compressed Image and Video". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ea614223cd32736861c6e0ea78008c525b779dc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_11.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Swin Transformer v2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ca532b2e8255ddf865c053ac62b8599bcd10f6e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_12.txt @@ -0,0 +1 @@ +[Swinv2ForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1cf596824cb99f0b5a7eb444f535eb3beb99e563 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_13.txt @@ -0,0 +1,5 @@ +See also: Image classification task guide + +Besides that: + +[Swinv2ForMaskedImageModeling] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_14.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_15.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
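A minimal classification sketch for the example mentioned above, assuming the microsoft/swinv2-tiny-patch4-window8-256 checkpoint and a random tensor in place of a processed image:
import torch
from transformers import Swinv2ForImageClassification

model = Swinv2ForImageClassification.from_pretrained("microsoft/swinv2-tiny-patch4-window8-256")
pixel_values = torch.randn(1, 3, 256, 256)       # stand-in for AutoImageProcessor output
with torch.no_grad():
    logits = model(pixel_values).logits
predicted = logits.argmax(-1).item()
print(model.config.id2label[predicted])          # an ImageNet-1k label for this (random) input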
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b9c59a8768cc57f96e645fec998822a5dd024eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_16.txt @@ -0,0 +1,11 @@ +Swinv2Config +[[autodoc]] Swinv2Config +Swinv2Model +[[autodoc]] Swinv2Model + - forward +Swinv2ForMaskedImageModeling +[[autodoc]] Swinv2ForMaskedImageModeling + - forward +Swinv2ForImageClassification +[[autodoc]] transformers.Swinv2ForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_5.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7cd013071ba98e2e2b36cd988f6f7d1d3d5c687 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_5.txt @@ -0,0 +1 @@ +Three main techniques are proposed: 1) a residual-post-norm method combined with cosine attention to improve training stability; 2) A log-spaced continuous position bias method to effectively transfer models pre-trained using low-resolution images to downstream tasks with high-resolution inputs; 3) A self-supervised pre-training method, SimMIM, to reduce the needs of vast labeled images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d6168652f659e7f5748adb564dc616cfe1977d3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_6.txt @@ -0,0 +1 @@ +Through these techniques, this paper successfully trained a 3 billion-parameter Swin Transformer V2 model, which is the largest dense vision model to date, and makes it capable of training with images of up to 1,536×1,536 resolution. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..7da0a170dbeb5991dc526525df93322a95f2b453 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_7.txt @@ -0,0 +1 @@ +It set new performance records on 4 representative vision tasks, including ImageNet-V2 image classification, COCO object detection, ADE20K semantic segmentation, and Kinetics-400 video action classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b147d6624431741c8733ee6fdb1a396291ed454 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_8.txt @@ -0,0 +1 @@ +Also note our training is much more efficient than that in Google's billion-level visual models, which consumes 40 times less labelled data and 40 times less training time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_swinv2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_swinv2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..320c1b50a33bb48e570f9fc2289505509bac3173 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_swinv2/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nandwalritik. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_10.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c12f5872d9e542216a2b9a636a81e27129c8fd48 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_10.txt @@ -0,0 +1 @@ +We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_11.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..833b1c743d95b295fd153139b46333837b4a3d9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_11.txt @@ -0,0 +1 @@ +Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_12.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ca468bb1b431333c9dd45f21251c43559d4a45c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_12.txt @@ -0,0 +1 @@ +We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_13.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..fce194fa0d072797b1004d2b2db6b2aa97d7ebc1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_13.txt @@ -0,0 +1 @@ +These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_14.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6bbfdc3d19af3841847afbc4124e6a666b6eba3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_14.txt @@ -0,0 +1 @@ +Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_15.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2586020e0dd3afccae1e8e6299e7956c2f726b06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by Younes Belkada and Arthur Zucker. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_16.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_17.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bc3753f42b2356534f43315a81c92c56a7303af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_17.txt @@ -0,0 +1,3 @@ +Usage tips + +SwitchTransformers uses the [T5Tokenizer], which can be loaded directly from each model's repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_18.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5efe36ed1cfecc26f6c7c5fb723e20303a099b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_18.txt @@ -0,0 +1 @@ +The released weights are pretrained on English Masked Language Modeling task, and should be finetuned. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_19.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e42df6aac5ca8cd7ef47e62aad387e1be71731f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_19.txt @@ -0,0 +1,23 @@ +Resources + +Translation task guide +Summarization task guide + +SwitchTransformersConfig +[[autodoc]] SwitchTransformersConfig +SwitchTransformersTop1Router +[[autodoc]] SwitchTransformersTop1Router + - _compute_router_probabilities + - forward +SwitchTransformersSparseMLP +[[autodoc]] SwitchTransformersSparseMLP + - forward +SwitchTransformersModel +[[autodoc]] SwitchTransformersModel + - forward +SwitchTransformersForConditionalGeneration +[[autodoc]] SwitchTransformersForConditionalGeneration + - forward +SwitchTransformersEncoderModel +[[autodoc]] SwitchTransformersEncoderModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_6.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..90717c099c0ba15e5bbb931175f8b133423ca4db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_6.txt @@ -0,0 +1,2 @@ +The abstract from the paper is the following: +In deep learning, models typically reuse the same parameters for all inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_7.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..973aeff2c8174baedb0751c67355ac8e2c73d6ba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_7.txt @@ -0,0 +1 @@ +Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. 
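Putting the usage tips above together, a small hedged sketch of running the pretrained span-MLM weights; google/switch-base-8 is assumed as an illustrative checkpoint, and the prompt follows the T5 sentinel-token convention:
from transformers import AutoTokenizer, SwitchTransformersForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")   # a T5-style tokenizer under the hood
model = SwitchTransformersForConditionalGeneration.from_pretrained("google/switch-base-8")

input_ids = tokenizer(
    "The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt"
).input_ids
outputs = model.generate(input_ids, max_new_tokens=20)
# The prediction fills the sentinel slots, e.g. "<extra_id_0> ... <extra_id_1> ..."
print(tokenizer.decode(outputs[0], skip_special_tokens=False))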
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_8.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6907545b2bdef0bdaafad9048f0780b4c911c0c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_8.txt @@ -0,0 +1 @@ +The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_9.txt b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a1667747e7e65d5d4ac4e837c9218c6ed341e5c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_switch_transformers/chunk_9.txt @@ -0,0 +1 @@ +However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_100.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..8debe4cbd419607450e9514e3058239260a1ba3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_100.txt @@ -0,0 +1 @@ +[TFT5ForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_101.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..66e8f35c9a601d987d95b20c0e8515074d41fdc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_101.txt @@ -0,0 +1 @@ +[FlaxT5ForConditionalGeneration] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_102.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8483011ba0024ea5e57bc7e383bed0dbe42eb97 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_102.txt @@ -0,0 +1 @@ +Summarization chapter of the 🤗 Hugging Face course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_103.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..197b6e1079191032d33cf31674fcac2ee99e5780 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_103.txt @@ -0,0 +1,3 @@ +Summarization task guide + +[FlaxT5ForConditionalGeneration] is supported by this example script for training T5 with a span-masked language model objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_104.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..8643198a47f2d1dbb30aaac985c8ef59246e937c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_104.txt @@ -0,0 +1 @@ +The script also shows how to train a T5 tokenizer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_105.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..86ce55eb2eb95ffbcc5c1cae6d02987e26e4a953 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_105.txt @@ -0,0 +1 @@ +[FlaxT5ForConditionalGeneration] is also supported by this notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_106.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3311968c73eb1f0ba8ade1bb0f2ce4ba1a8f4d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_106.txt @@ -0,0 +1 @@ +[T5ForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_107.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..8debe4cbd419607450e9514e3058239260a1ba3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_107.txt @@ -0,0 +1 @@ +[TFT5ForConditionalGeneration] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_108.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..efb906417e33ace851278224880d53e5962be033 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_108.txt @@ -0,0 +1,3 @@ +Translation task guide + +A notebook on how to finetune T5 for question answering with TensorFlow 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_109.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..2934a0f906282250c446c70531ea36607659bdfd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_109.txt @@ -0,0 +1,2 @@ +🌎 +A notebook on how to finetune T5 for question answering on a TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_110.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..53f238738045fb79e6d18ce453090e05dd9ca24a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_110.txt @@ -0,0 +1,2 @@ +🚀 Deploy +- A blog post on how to deploy T5 11B for inference for less than $500. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_111.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..5558d54bac5a780d0e7e9a1cb6daeb4d09478289 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_111.txt @@ -0,0 +1,53 @@ +T5Config +[[autodoc]] T5Config +T5Tokenizer +[[autodoc]] T5Tokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +T5TokenizerFast +[[autodoc]] T5TokenizerFast + +T5Model +[[autodoc]] T5Model + - forward +T5ForConditionalGeneration +[[autodoc]] T5ForConditionalGeneration + - forward +T5EncoderModel +[[autodoc]] T5EncoderModel + - forward +T5ForSequenceClassification +[[autodoc]] T5ForSequenceClassification + - forward +T5ForTokenClassification +[[autodoc]] T5ForTokenClassification + - forward +T5ForQuestionAnswering +[[autodoc]] T5ForQuestionAnswering + - forward + +TFT5Model +[[autodoc]] TFT5Model + - call +TFT5ForConditionalGeneration +[[autodoc]] TFT5ForConditionalGeneration + - call +TFT5EncoderModel +[[autodoc]] TFT5EncoderModel + - call + +FlaxT5Model +[[autodoc]] FlaxT5Model + - call + - encode + - decode +FlaxT5ForConditionalGeneration +[[autodoc]] FlaxT5ForConditionalGeneration + - call + - encode + - decode +FlaxT5EncoderModel +[[autodoc]] FlaxT5EncoderModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_31.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..6182dcc50960fc2952b4d5b2c4e20c66f7d7be92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_31.txt @@ -0,0 +1,2 @@ +Refer to + the documentation of mT5 which can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_32.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..18a35b6486c7bb303f76d77a615e3f776fb955ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_32.txt @@ -0,0 +1,2 @@ +Training +T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_33.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..edc3e1514bb220b3ff701781521598cc3b3886b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_33.txt @@ -0,0 +1,2 @@ +It is trained using teacher +forcing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_34.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..3caa1d9319f110515a7b876eb2ab3830043905e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_34.txt @@ -0,0 +1 @@ +This means that for training, we always need an input sequence and a corresponding target sequence. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_35.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd38c1ab65bea1d7a3315a6722a16a12f19740f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_35.txt @@ -0,0 +1,2 @@ +The input +sequence is fed to the model using input_ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_36.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..4db0a7af1588bab4740fbb215a934ac8def05215 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_36.txt @@ -0,0 +1,2 @@ +The target sequence is shifted to the right, i.e., prepended by a +start-sequence token and fed to the decoder using the decoder_input_ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_37.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..a053626d9bc9ea21b0ad9cf6372deb5003b9e01b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_37.txt @@ -0,0 +1,2 @@ +In teacher-forcing style, the target +sequence is then appended by the EOS token and corresponds to the labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_38.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c70262e2dbfb056e93bf31ca7a9ceba58ef7357 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_38.txt @@ -0,0 +1,2 @@ +The PAD token is hereby used as the +start-sequence token. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_39.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d15d25777393fa7ead7efc0024435197c9a0736 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_39.txt @@ -0,0 +1 @@ +T5 can be trained / fine-tuned both in a supervised and unsupervised fashion. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_40.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..2153ac6460da5285f869baacaf3565303bdcc3a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_40.txt @@ -0,0 +1,2 @@ +One can use [T5ForConditionalGeneration] (or the Tensorflow/Flax variant), which includes the +language modeling head on top of the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_41.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b20d69530932d42dd1438eacae186eec2660255 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_41.txt @@ -0,0 +1,4 @@ +Unsupervised denoising training + +In this setup, spans of the input sequence are masked by so-called sentinel tokens (a.k.a unique mask tokens) and +the output sequence is formed as a concatenation of the same sentinel tokens and the real masked tokens. 
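Since the denoising setup above relies on sentinel tokens, it can be handy to confirm that the tokenizer actually carries them. A small sketch using [T5Tokenizer]; the slicing is only for display:
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small")
sentinels = [tok for tok in tokenizer.additional_special_tokens if tok.startswith("<extra_id_")]
print(len(sentinels))          # 100 sentinel tokens by default
print(sorted(sentinels)[:3])   # e.g. ['<extra_id_0>', '<extra_id_1>', '<extra_id_10>'] (lexicographic order)
print(tokenizer.convert_tokens_to_ids("<extra_id_0>"))   # sentinel tokens sit at the top of the vocabulary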
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_42.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..7510dc6b77d117b142ad08b57460d36b7a89c298 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_42.txt @@ -0,0 +1,3 @@ +Each +sentinel token represents a unique mask token for this sentence and should start with <extra_id_0>, +<extra_id_1>, up to <extra_id_99>. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_43.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ae86ee5b8a2c3511932abb6e34e1459ca047c3e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_43.txt @@ -0,0 +1,2 @@ +As a default, 100 sentinel tokens are available in +[T5Tokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_44.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bcddf47b5d0c82157d730f3508a9803e1010514 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_44.txt @@ -0,0 +1,16 @@ +For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be +processed as follows: +python + +from transformers import T5Tokenizer, T5ForConditionalGeneration +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids +labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids +# the forward function automatically creates the correct decoder_input_ids +loss = model(input_ids=input_ids, labels=labels).loss +loss.item() +3.7837 + +If you're interested in pre-training T5 on a new corpus, check out the run_t5_mlm_flax.py script in the Examples +directory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_45.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7194671549f6d27f151bd56d85f29b5b08c449d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_45.txt @@ -0,0 +1,3 @@ +Supervised training + +In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_46.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb7f723cf1c3de2dbc6b4111f7041a1012c89d4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_46.txt @@ -0,0 +1,2 @@ +Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input +sequence "The house is wonderful." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_47.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..daff1412ade557fa7d6c740f1ed718cf6fab3352 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_47.txt @@ -0,0 +1 @@ +and output sequence "Das Haus ist wunderbar.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_48.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b27a3841ac560819ef75f383b0afe941f5ec548 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_48.txt @@ -0,0 +1,8 @@ +", then they should be prepared for +the model as follows: +python + +from transformers import T5Tokenizer, T5ForConditionalGeneration +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +input_ids = tokenizer("translate English to German: The house is wonderful. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_49.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8fba50a46a956d57cdafca8ea17ae37b5a2bd30 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_49.txt @@ -0,0 +1,2 @@ +", return_tensors="pt").input_ids +labels = tokenizer("Das Haus ist wunderbar. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_50.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..8612ab1d9d95c0b22c329d7c6d73cdf71d40a912 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_50.txt @@ -0,0 +1,9 @@ +", return_tensors="pt").input_ids +# the forward function automatically creates the correct decoder_input_ids +loss = model(input_ids=input_ids, labels=labels).loss +loss.item() +0.2542 + +As you can see, only 2 inputs are required for the model in order to compute a loss: input_ids (which are the +input_ids of the encoded input sequence) and labels (which are the input_ids of the encoded +target sequence). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_51.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdfd380edeb9d501c8a98cb6ebc2727c3c5101b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_51.txt @@ -0,0 +1,3 @@ +The model will automatically create the decoder_input_ids based on the labels, by +shifting them one position to the right and prepending the config.decoder_start_token_id, which for T5 is +equal to 0 (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_52.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..4503de741fd0009bddbd37aac67e4edf80c663ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_52.txt @@ -0,0 +1 @@ +the id of the pad token). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_53.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d453f1cd27f56283b32fc2bb82981e00e08cd72 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_53.txt @@ -0,0 +1,2 @@ +Also note the task prefix: we prepend the input sequence with 'translate +English to German: ' before encoding it.
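The shift-to-the-right behaviour described above can be sketched in a few lines. This is an illustrative re-implementation, not the library's internal function, and the token ids in the demo tensor are placeholders:
import torch

def shift_right(labels: torch.Tensor, decoder_start_token_id: int = 0, pad_token_id: int = 0) -> torch.Tensor:
    # Prepend the start token and drop the last position, mirroring how
    # decoder_input_ids are derived from labels during teacher forcing.
    shifted = labels.new_zeros(labels.shape)
    shifted[:, 1:] = labels[:, :-1].clone()
    shifted[:, 0] = decoder_start_token_id
    # If labels use -100 for ignored positions, map them back to the pad token id.
    shifted.masked_fill_(shifted == -100, pad_token_id)
    return shifted

labels = torch.tensor([[644, 4598, 229, 19250, 5, 1]])   # illustrative target ids ending in </s>
print(shift_right(labels))   # tensor([[    0,   644,  4598,   229, 19250,     5]])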
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_54.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..632b291cec7277a70c3f0252199f47aab44dbb55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_54.txt @@ -0,0 +1,2 @@ +This will help in improving the performance, as this task prefix was used +during T5's pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_55.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..68984bf12bebf1bcc761bacf8580d55e0831f2b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_55.txt @@ -0,0 +1 @@ +However, the example above only shows a single training example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_56.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bcd667bacc94e9a1de23ea51fad13755da0e0e5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_56.txt @@ -0,0 +1,2 @@ +In practice, one trains deep learning models in +batches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_57.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8f5a0195d87f211257c5cea6490f125d3c6ec02 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_57.txt @@ -0,0 +1 @@ +This entails that we must pad/truncate examples to the same length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_58.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..efeeeaa81bddc50e0cd855ab2600a8f959b7be14 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_58.txt @@ -0,0 +1,3 @@ +For encoder-decoder models, one +typically defines a max_source_length and max_target_length, which determine the maximum length of the +input and output sequences respectively (otherwise they are truncated). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_59.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..654f1531260359d9654c0ede7d2689078fd90f63 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_59.txt @@ -0,0 +1,2 @@ +These should be carefully set depending on +the task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_60.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..f483f2941bfeb57988b7837bbe11895ab53be276 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_60.txt @@ -0,0 +1,2 @@ +In addition, we must make sure that padding token id's of the labels are not taken into account by the loss +function. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_61.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bee031c7b5674588d7b1fc4da787c457c75e570 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_61.txt @@ -0,0 +1,2 @@ +In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the ignore_index +of the CrossEntropyLoss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_62.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..6623d2b422fdd5965092bfeb1c7367499aad2d95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_62.txt @@ -0,0 +1,2 @@ +In Flax, one can use the decoder_attention_mask to ignore padded tokens from +the loss (see the Flax summarization script for details). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_63.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad15c1c734120c63d00a55e595d687b48c09717b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_63.txt @@ -0,0 +1,3 @@ +We also pass +attention_mask as additional input to the model, which makes sure that padding tokens of the inputs are +ignored. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_64.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..1375c2193c6f88f384c6148213f8d02f6595b709 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_64.txt @@ -0,0 +1 @@ +The code example below illustrates all of this. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_65.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..32da15f341392c51e17ffc1d28f4ab5d9c43c5d1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_65.txt @@ -0,0 +1,45 @@ +thon + +from transformers import T5Tokenizer, T5ForConditionalGeneration +import torch +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +the following 2 hyperparameters are task-specific +max_source_length = 512 +max_target_length = 128 +Suppose we have the following 2 training examples: +input_sequence_1 = "Welcome to NYC" +output_sequence_1 = "Bienvenue à NYC" +input_sequence_2 = "HuggingFace is a company" +output_sequence_2 = "HuggingFace est une entreprise" +encode the inputs +task_prefix = "translate English to French: " +input_sequences = [input_sequence_1, input_sequence_2] +encoding = tokenizer( + [task_prefix + sequence for sequence in input_sequences], + padding="longest", + max_length=max_source_length, + truncation=True, + return_tensors="pt", + ) +input_ids, attention_mask = encoding.input_ids, encoding.attention_mask +encode the targets +target_encoding = tokenizer( + [output_sequence_1, output_sequence_2], + padding="longest", + max_length=max_target_length, + truncation=True, + return_tensors="pt", + ) +labels = target_encoding.input_ids +replace padding token id's of the labels by -100 so it's ignored by the loss +labels[labels == tokenizer.pad_token_id] = -100 +forward pass +loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss +loss.item() +0.188 + +Additional training tips: + +T5 models need a slightly higher learning rate than the default one set in the Trainer when using the AdamW +optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_66.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3681cf0b68273782ed760e28696124ba1e806fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_66.txt @@ -0,0 +1,2 @@ +Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question +answering, question generation). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_67.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..d774dac62ee3a8aba61827b2585685ab04e36995 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_67.txt @@ -0,0 +1 @@ +Note that T5 was pre-trained using the AdaFactor optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_68.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..f34ed96cbb5dc03d5be1d1839399bd68a6fc639b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_68.txt @@ -0,0 +1,4 @@ +According to this forum post, task prefixes matter when +(1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's +pre-training mixture (see Appendix D of the paper for the task prefixes +used). 
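For the optimizer tips above (a higher learning rate than the Trainer's AdamW default, and the fact that T5 was pre-trained with AdaFactor), here is a hedged sketch using the Adafactor implementation shipped with Transformers; the hyperparameters are one common choice rather than a prescription, and the remaining training tips follow below:

```python
from transformers import T5ForConditionalGeneration
from transformers.optimization import Adafactor, AdafactorSchedule

model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small")

# with relative_step=True, Adafactor computes its own step sizes, so lr stays None;
# AdafactorSchedule simply exposes that internal schedule, e.g. for the Trainer
optimizer = Adafactor(
    model.parameters(),
    scale_parameter=True,
    relative_step=True,
    warmup_init=True,
    lr=None,
)
lr_scheduler = AdafactorSchedule(optimizer)
# can then be plugged into Trainer via optimizers=(optimizer, lr_scheduler)
```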
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_69.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b8d03f9ee4e0ee786de45bf8c9dd96e4b51d26b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_69.txt @@ -0,0 +1,2 @@ +If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of +pad_to_multiple_of to have a small number of predefined bucket sizes to fit all examples in. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_70.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..108fd7e619eee0fc2cccf638039373b6bf2ffccf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_70.txt @@ -0,0 +1,3 @@ +Dynamically padding +batches to the longest example is not recommended on TPU as it triggers a recompilation for every batch shape that is +encountered during training thus significantly slowing down the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_71.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..81175839ee033c273df73559035db7947ddc3938 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_71.txt @@ -0,0 +1,2 @@ +only padding up to the longest example in a +batch) leads to very slow training on TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_72.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9f703ce07701a84037afd5eac4abbf84d9b9553 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_72.txt @@ -0,0 +1,2 @@ +Inference +At inference time, it is recommended to use [~generation.GenerationMixin.generate]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_73.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..e017cd5582b90a287f0a8d102a54c6c26305ea00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_73.txt @@ -0,0 +1,3 @@ +This +method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder +and auto-regressively generates the decoder output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_74.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ee937d4514b3bc4c61e0c1f2bec062f4b84eabf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_74.txt @@ -0,0 +1 @@ +Check out this blog post to know all the details about generating text with Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_75.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..170d59d30c23c8325e51c4cb6b818aa250c06453 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_75.txt @@ -0,0 +1,2 @@ +There's also this blog post which explains how +generation works in general in encoder-decoder models. 
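For the TPU padding tip above, a minimal sketch of both options, assuming the T5 tokenizer from the earlier examples and a list of input strings `sentences`; the 512 and 64 values are illustrative, and the `generate()`-based inference examples follow below:

```python
# Option 1: pad every example to one fixed length so the compiled batch shape never changes
encoding = tokenizer(
    sentences, padding="max_length", max_length=512, truncation=True, return_tensors="pt"
)

# Option 2: keep dynamic padding but round lengths up to a multiple of 64,
# limiting the number of distinct shapes the TPU has to compile for
encoding = tokenizer(
    sentences, padding=True, pad_to_multiple_of=64, truncation=True, return_tensors="pt"
)
```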
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_76.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..4de89cb6f0d9354da9c37a9c544a0c0e33f76d10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_76.txt @@ -0,0 +1,6 @@ +thon + +from transformers import T5Tokenizer, T5ForConditionalGeneration +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +input_ids = tokenizer("translate English to German: The house is wonderful. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_77.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..db2ecba9aa4b3ca0045650933aa48746a65e4a61 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_77.txt @@ -0,0 +1,4 @@ +", return_tensors="pt").input_ids +outputs = model.generate(input_ids) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +Das Haus ist wunderbar. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_78.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..cab7a419808af3aab40d820d28274fd502a5b881 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_78.txt @@ -0,0 +1,2 @@ +Note that T5 uses the pad_token_id as the decoder_start_token_id, so when doing generation without using +[~generation.GenerationMixin.generate], make sure you start it with the pad_token_id. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_79.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..38af84fe16bf43f5bdf5c9d99e7bbd2a14008c75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_79.txt @@ -0,0 +1 @@ +The example above only shows a single example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_80.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..7da2a86867c06b847acb8eb85b1cfe41d2330d83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_80.txt @@ -0,0 +1,9 @@ +You can also do batched inference, like so: +thon + +from transformers import T5Tokenizer, T5ForConditionalGeneration +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +task_prefix = "translate English to German: " +use different length sentences to test batching +sentences = ["The house is wonderful. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_81.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..911fdef194375bcf28bbb39d4c791dbabb96d5c8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_81.txt @@ -0,0 +1 @@ +", "I like to work in NYC."] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_82.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c379c7ed1bf2ecbbb1b8d7b7c34f228158dd83e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_82.txt @@ -0,0 +1,8 @@ +inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True) +output_sequences = model.generate( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + do_sample=False, # disable sampling to test if batching affects output + ) +print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True)) +['Das Haus ist wunderbar. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_83.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fa65fd78312fd4709c801fae45224e56646e42a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_83.txt @@ -0,0 +1 @@ +', 'Ich arbeite gerne in NYC.'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_84.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce296ba0b24442b696bfcbb022f418a8d8606059 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_84.txt @@ -0,0 +1,2 @@ +Because T5 has been trained with the span-mask denoising objective, +it can be used to predict the sentinel (masked-out) tokens during inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_85.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbd6916374d5b3ff5057fb14285da813236099a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_85.txt @@ -0,0 +1 @@ +The predicted tokens will then be placed between the sentinel tokens. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_86.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..58102bef9901eb86f53ca6426d42b7ea7fc83aee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_86.txt @@ -0,0 +1,10 @@ +python + +from transformers import T5Tokenizer, T5ForConditionalGeneration +tokenizer = T5Tokenizer.from_pretrained("google-t5/t5-small") +model = T5ForConditionalGeneration.from_pretrained("google-t5/t5-small") +input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids +sequence_ids = model.generate(input_ids) +sequences = tokenizer.batch_decode(sequence_ids) +sequences +['<pad><extra_id_0> park offers<extra_id_1> the<extra_id_2> park.</s>'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_87.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..2233a94310401832336a0633e12433e7629eaad8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_87.txt @@ -0,0 +1,2 @@ +Performance +If you'd like faster training and inference performance, install NVIDIA APEX for NVIDIA GPUs, or ROCm APEX for AMD GPUs, and then the model will automatically use apex.normalization.FusedRMSNorm instead of T5LayerNorm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_88.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..0974d0452590e23467e1a9cfbe4b1246b110f5ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_88.txt @@ -0,0 +1 @@ +The former uses an optimized fused kernel which is several times faster than the latter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_89.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fa08a2c68e1b484e97db9a6bd5abb64d1dbfdab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_89.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with T5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_90.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_90.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_91.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_91.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_92.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d822a973f5b2c21264e0f81bc86148fe3d8f79c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_92.txt @@ -0,0 +1 @@ +A notebook for how to finetune T5 for classification and multiple choice. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_93.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..adec1b1089123a4051d918f8dc833bf65609ad28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_93.txt @@ -0,0 +1 @@ +A notebook for how to finetune T5 for sentiment span extraction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_94.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..28af9768145873e01d1e706ff36206db198acfdd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_94.txt @@ -0,0 +1,3 @@ +🌎 + +A notebook for how to finetune T5 for named entity recognition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_95.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..a21bfba72a6fb73dbab7ed084ebd5b3dfb7aa7ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_95.txt @@ -0,0 +1,3 @@ +🌎 + +A notebook for Finetuning CodeT5 for generating docstrings from Ruby code. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_96.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..acf35e94806850b77b9fbf3ea631db90bd71db52 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_96.txt @@ -0,0 +1 @@ +A notebook to Finetune T5-base-dutch to perform Dutch abstractive summarization on a TPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_97.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..9010163f414a825a6f7ed03f8e8847ce94700a4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_97.txt @@ -0,0 +1 @@ +A notebook for how to finetune T5 for summarization in PyTorch and track experiments with WandB. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_98.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d951b4ca395d6222a7f83235ee483cfb094e4d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_98.txt @@ -0,0 +1,2 @@ +🌎 +A blog post on Distributed Training: Train BART/T5 for Summarization using 🤗 Transformers and Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5/chunk_99.txt b/chunked/content_aware_chunking/model_doc_t5/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3311968c73eb1f0ba8ade1bb0f2ce4ba1a8f4d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5/chunk_99.txt @@ -0,0 +1 @@ +[T5ForConditionalGeneration] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_10.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f532d3fb79668e6dc9b0f206ecfde31df52e8cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_10.txt @@ -0,0 +1 @@ +"xl" and "xxl" replace "3B" and "11B". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_11.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..01d405d9f3f5976126ee023a37fc706404596867 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_11.txt @@ -0,0 +1,2 @@ +The model shapes are a bit different - larger d_model and smaller + num_heads and d_ff. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_12.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..adee764e1b43d85bc0ca90c86d319901c5806faa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_12.txt @@ -0,0 +1,2 @@ +Note: T5 Version 1.1 was only pre-trained on C4 excluding any supervised +training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_13.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..517302f0373df42f5b8daa883b23c7dbe8a1c1a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_13.txt @@ -0,0 +1,2 @@ +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 +model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_14.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..5029ad25c094ff0e41a303c00d49119cbc232044 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_14.txt @@ -0,0 +1,2 @@ +Since t5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task +fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_15.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02171176dfea8686a381c899b340100c72beb4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_15.txt @@ -0,0 +1 @@ +If you are doing multi-task fine-tuning, you should use a prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_16.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd1a90cfc35a976273ae26605372ce425cd1231e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_16.txt @@ -0,0 +1,11 @@ +Google has released the following variants: + +google/t5-v1_1-small + +google/t5-v1_1-base + +google/t5-v1_1-large + +google/t5-v1_1-xl + +google/t5-v1_1-xxl. 
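The naming and shape differences described above can be inspected directly from the released configurations; a small sketch (the printed values depend on which checkpoint sizes you compare):

```python
from transformers import AutoConfig

for checkpoint in ["google-t5/t5-base", "google/t5-v1_1-base"]:
    cfg = AutoConfig.from_pretrained(checkpoint)
    # d_model, num_heads and d_ff are the dimensions the comparison above refers to
    print(checkpoint, "d_model:", cfg.d_model, "num_heads:", cfg.num_heads, "d_ff:", cfg.d_ff)
```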
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_17.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c876a86f183d7372eefe7a374f721dae2577dbfa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_17.txt @@ -0,0 +1 @@ +Refer to T5's documentation page for all API reference, tips, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_4.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..963080ac939e3da0afb982b88f47c3832af84395 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_4.txt @@ -0,0 +1,10 @@ +Usage tips +One can directly plug in the weights of T5v1.1 into a T5 model, like so: +thon + +from transformers import T5ForConditionalGeneration +model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base") + +T5 Version 1.1 includes the following improvements compared to the original T5 model: + +GEGLU activation in the feed-forward hidden layer, rather than ReLU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_5.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..89a53d84e4c183505110c69089b471c108c64cee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_5.txt @@ -0,0 +1 @@ +See this paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_6.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..40c303ef126fc467c6d62e2497576d2f6b6dae58 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_6.txt @@ -0,0 +1 @@ +Dropout was turned off in pre-training (quality win). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_7.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b4d6ca6ea7297795a895dd47e87b3ed36e33617 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_7.txt @@ -0,0 +1 @@ +Dropout should be re-enabled during fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_8.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f53aa3b410424c9774e32b5317d0de035d4c7071 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_8.txt @@ -0,0 +1 @@ +Pre-trained on C4 only without mixing in the downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_9.txt b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8f2233f412c642a775829ef724519365fb5ee99 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_t5v1.1/chunk_9.txt @@ -0,0 +1 @@ +No parameter sharing between the embedding and classifier layer. 
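Since dropout was turned off during T5v1.1 pre-training but should be re-enabled for fine-tuning (see above), one way to make that explicit is to override `dropout_rate` when loading the checkpoint; a hedged sketch, with 0.1 being just a common choice:

```python
from transformers import T5ForConditionalGeneration

# keyword arguments that match config fields override the values stored with the checkpoint
model = T5ForConditionalGeneration.from_pretrained("google/t5-v1_1-base", dropout_rate=0.1)
print(model.config.dropout_rate)
```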
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..516948c940a23d1f42c92ff5519a3712319aa9cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_10.txt @@ -0,0 +1 @@ +Table detection and table structure recognition clarified. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2455fd4fc88c163befae09a7f7629a4278430e77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_12.txt @@ -0,0 +1,3 @@ +The authors released 2 models, one for table detection in +documents, one for table structure recognition +(the task of recognizing the individual rows, columns etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..44a59875c77b67014cd2fd584757481be8d3f508 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_13.txt @@ -0,0 +1 @@ +in a table). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_14.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf67fffa4f5d9001fd37fd419894f4c6591262b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_15.txt @@ -0,0 +1,2 @@ +The original code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a6fa48da12167894f5bcdf7cb579a4468732bf6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_16.txt @@ -0,0 +1,3 @@ +Resources + +A demo notebook for the Table Transformer can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0811f81fcfdf075564a5618fae6081e0e88d2025 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_17.txt @@ -0,0 +1 @@ +It turns out padding of images is quite important for detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a7bf9ac5fc5bb7abb2eb9b9e320bd7219447b96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_18.txt @@ -0,0 +1 @@ +An interesting Github thread with replies from the authors can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..137544c895c26320d55dca0fb280f279037ea983 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_19.txt @@ -0,0 +1,8 @@ +TableTransformerConfig +[[autodoc]] TableTransformerConfig +TableTransformerModel +[[autodoc]] TableTransformerModel + - forward +TableTransformerForObjectDetection +[[autodoc]] TableTransformerForObjectDetection + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..69554504740eea6df4f888f6d87985cacf1ef362 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_7.txt @@ -0,0 +1,2 @@ +It also addresses a significant +source of ground truth inconsistency observed in prior datasets called oversegmentation, using a novel canonicalization procedure. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..88f8d8f556f3f5ccc353195f0059564035695113 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_8.txt @@ -0,0 +1,2 @@ +We demonstrate that these improvements lead to a +significant increase in training performance and a more reliable estimate of model performance at evaluation for table structure recognition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_table-transformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f4580f8401adc60c334cf2d507fe265efa7392d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_table-transformer/chunk_9.txt @@ -0,0 +1,3 @@ +Further, we show that transformer-based +object detection models trained on PubTables-1M produce excellent results for all three tasks of detection, structure recognition, and functional analysis without the need for any +special customization for these tasks. 
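As a hedged sketch of the table detection model discussed above (the checkpoint name, input image path, and 0.7 threshold are illustrative assumptions), note that the image processor takes care of the resizing, normalization, and padding that the tip above says matters for detection:

```python
import torch
from PIL import Image
from transformers import AutoImageProcessor, TableTransformerForObjectDetection

image = Image.open("page.png").convert("RGB")  # a scanned document page (path assumed)

processor = AutoImageProcessor.from_pretrained("microsoft/table-transformer-detection")
model = TableTransformerForObjectDetection.from_pretrained("microsoft/table-transformer-detection")

inputs = processor(images=image, return_tensors="pt")  # resizing/normalization/padding happen here
with torch.no_grad():
    outputs = model(**inputs)

# convert the raw outputs to scored boxes in the original image coordinates
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs, threshold=0.7, target_sizes=target_sizes)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```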
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_100.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2fbdeb074d8c2693df99f66ee0e5c8626e23677 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_100.txt @@ -0,0 +1,2 @@ +In case your dataset involves conversational questions (such as in SQA), then you should first group together the queries, answer_coordinates and answer_text per table (in the order of their position +index) and batch encode each table with its questions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_101.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..e97249999141049598d70b5d4e2ecea668b31a89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_101.txt @@ -0,0 +1 @@ +This will make sure that the prev_labels token types (see docs of [TapasTokenizer]) are set correctly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_102.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..d04ca5013f6cf0d832ee5fc892edcae8a2af495b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_102.txt @@ -0,0 +1 @@ +See this notebook for more info. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_103.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bea07c73361b4a1727c3c27ed494308f92bddf7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_103.txt @@ -0,0 +1 @@ +See this notebook for more info regarding using the TensorFlow model. 
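Before the fine-tuning step below, here is a hedged sketch of the batch-encoding just described: all (conversational) questions for one table are passed to [TapasTokenizer] in a single call, in order, together with their answer coordinates and answer texts, so that the labels and prev_labels token types are created consistently (the table and answers are toy data):

```python
import pandas as pd
from transformers import TapasTokenizer

tokenizer = TapasTokenizer.from_pretrained("google/tapas-base")

data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
table = pd.DataFrame.from_dict(data)

# all queries for this table, in conversational order, encoded in one call
queries = ["What is the name of the first actor?", "How many movies has he played in?"]
answer_coordinates = [[(0, 0)], [(0, 1)]]  # (row, column) of the answer cells, 0-indexed
answer_text = [["Brad Pitt"], ["87"]]

inputs = tokenizer(
    table=table,
    queries=queries,
    answer_coordinates=answer_coordinates,
    answer_text=answer_text,
    padding="max_length",
    return_tensors="pt",
)
# inputs now contains input_ids, attention_mask, token_type_ids and labels
```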
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_104.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..003903a8aa1d086a4854f50108b6c5633e7ed222 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_104.txt @@ -0,0 +1,95 @@ +**STEP 4: Train (fine-tune) the model + +You can then fine-tune [TapasForQuestionAnswering] as follows (shown here for the weak supervision for aggregation case): + +from transformers import TapasConfig, TapasForQuestionAnswering, AdamW +this is the default WTQ configuration +config = TapasConfig( + num_aggregation_labels=4, + use_answer_as_supervision=True, + answer_loss_cutoff=0.664694, + cell_selection_preference=0.207951, + huber_loss_delta=0.121194, + init_cell_selection_weights_to_zero=True, + select_one_column=True, + allow_empty_column_selection=False, + temperature=0.0352513, + ) +model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +optimizer = AdamW(model.parameters(), lr=5e-5) +model.train() +for epoch in range(2): # loop over the dataset multiple times + for batch in train_dataloader: + # get the inputs; + input_ids = batch["input_ids"] + attention_mask = batch["attention_mask"] + token_type_ids = batch["token_type_ids"] + labels = batch["labels"] + numeric_values = batch["numeric_values"] + numeric_values_scale = batch["numeric_values_scale"] + float_answer = batch["float_answer"] + + # zero the parameter gradients + optimizer.zero_grad() + # forward + backward + optimize + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels, + numeric_values=numeric_values, + numeric_values_scale=numeric_values_scale, + float_answer=float_answer, + ) + loss = outputs.loss + loss.backward() + optimizer.step() +`` + + +You can then fine-tune [TFTapasForQuestionAnswering`] as follows (shown here for the weak supervision for aggregation case): + +import tensorflow as tf +from transformers import TapasConfig, TFTapasForQuestionAnswering +this is the default WTQ configuration +config = TapasConfig( + num_aggregation_labels=4, + use_answer_as_supervision=True, + answer_loss_cutoff=0.664694, + cell_selection_preference=0.207951, + huber_loss_delta=0.121194, + init_cell_selection_weights_to_zero=True, + select_one_column=True, + allow_empty_column_selection=False, + temperature=0.0352513, + ) +model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) +for epoch in range(2): # loop over the dataset multiple times + for batch in train_dataloader: + # get the inputs; + input_ids = batch[0] + attention_mask = batch[1] + token_type_ids = batch[4] + labels = batch[-1] + numeric_values = batch[2] + numeric_values_scale = batch[3] + float_answer = batch[6] + + # forward + backward + optimize + with tf.GradientTape() as tape: + outputs = model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + labels=labels, + numeric_values=numeric_values, + numeric_values_scale=numeric_values_scale, + float_answer=float_answer, + ) + grads = tape.gradient(outputs.loss, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + +Usage: inference + +Here we explain how you can use [TapasForQuestionAnswering] or [TFTapasForQuestionAnswering] for inference (i.e. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_105.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8209161d20e7a6894529545e787bd4364942eb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_105.txt @@ -0,0 +1 @@ +making predictions on new data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_106.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c9c98cfa64239bd0868e9253bee3666d9847170 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_106.txt @@ -0,0 +1 @@ +For inference, only input_ids, attention_mask and token_type_ids (which you can obtain using [TapasTokenizer]) have to be provided to the model to obtain the logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_107.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a0183a62bfb977ae529b4edfef17b7dc29d0764 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_107.txt @@ -0,0 +1 @@ +Next, you can use the handy [~models.tapas.tokenization_tapas.convert_logits_to_predictions] method to convert these into predicted coordinates and optional aggregation indices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_108.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..521f4fd76b7300672c4a2475398d8ff93227eec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_108.txt @@ -0,0 +1 @@ +However, note that inference is different depending on whether or not the setup is conversational. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_109.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..deaab3f301203774e47d33a57b88624b8561e2b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_109.txt @@ -0,0 +1 @@ +In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_110.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..827d56b09c46e72e4d620710c72c8ba4797e85a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_110.txt @@ -0,0 +1,10 @@ +Here's an example of that: + +from transformers import TapasTokenizer, TapasForQuestionAnswering +import pandas as pd +model_name = "google/tapas-base-finetuned-wtq" +model = TapasForQuestionAnswering.from_pretrained(model_name) +tokenizer = TapasTokenizer.from_pretrained(model_name) +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +queries = [ + "What is the name of the first actor? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_111.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0b4d030a4faa01bfdc209c9a0ebd794172310e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_111.txt @@ -0,0 +1,2 @@ +", + "How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_112.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1305a9242abd78c29803d3145ee6cfe9d84b4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_112.txt @@ -0,0 +1,2 @@ +", + "What is the total number of movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_113.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..e75759860b6eb016741f814fe0cf65580387b18a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_113.txt @@ -0,0 +1,31 @@ +", + ] +table = pd.DataFrame.from_dict(data) +inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="pt") +outputs = model(**inputs) +predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( + inputs, outputs.logits.detach(), outputs.logits_aggregation.detach() + ) +let's print out the results: +id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] +answers = [] +for coordinates in predicted_answer_coordinates: + if len(coordinates) == 1: + # only a single cell: + answers.append(table.iat[coordinates[0]]) + else: + # multiple cells + cell_values = [] + for coordinate in coordinates: + cell_values.append(table.iat[coordinate]) + answers.append(", ".join(cell_values)) +display(table) +print("") +for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): + print(query) + if predicted_agg == "NONE": + print("Predicted answer: " + answer) + else: + print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_114.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..4245bcd9ff5452308beeb3998ca5b0a1b1b8b0a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_114.txt @@ -0,0 +1,2 @@ +Predicted answer: Brad Pitt +How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_115.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..897707cbbcd9f05b006907ee903339f0c883c064 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_115.txt @@ -0,0 +1,2 @@ +Predicted answer: COUNT > 69 +What is the total number of movies? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_116.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..651321c85d4f1227ae7969ff83b46ce9a497c73e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_116.txt @@ -0,0 +1,5 @@ +Predicted answer: SUM > 87, 53, 69 +`` + + +Here we explain how you can use [TFTapasForQuestionAnswering] for inference (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_117.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8209161d20e7a6894529545e787bd4364942eb8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_117.txt @@ -0,0 +1 @@ +making predictions on new data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_118.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..53583dff09ccbd84a92428e46ba3c9665637f49e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_118.txt @@ -0,0 +1 @@ +For inference, only input_ids, attention_mask and token_type_ids (which you can obtain using [TapasTokenizer]) have to be provided to the model to obtain the logits. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_119.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..2eff5b235565a8914da3746b08d72256ac676fb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_119.txt @@ -0,0 +1 @@ +Next, you can use the handy [~models.tapas.tokenization_tapas.convert_logits_to_predictions] method to convert these into predicted coordinates and optional aggregation indices. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_120.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..521f4fd76b7300672c4a2475398d8ff93227eec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_120.txt @@ -0,0 +1 @@ +However, note that inference is different depending on whether or not the setup is conversational. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_121.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..deaab3f301203774e47d33a57b88624b8561e2b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_121.txt @@ -0,0 +1 @@ +In a non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_122.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bd3454b84f83d5500f0dbc6f99f51eb8644924b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_122.txt @@ -0,0 +1,10 @@ +Here's an example of that: + +from transformers import TapasTokenizer, TFTapasForQuestionAnswering +import pandas as pd +model_name = "google/tapas-base-finetuned-wtq" +model = TFTapasForQuestionAnswering.from_pretrained(model_name) +tokenizer = TapasTokenizer.from_pretrained(model_name) +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +queries = [ + "What is the name of the first actor? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_123.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0b4d030a4faa01bfdc209c9a0ebd794172310e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_123.txt @@ -0,0 +1,2 @@ +", + "How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_124.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1305a9242abd78c29803d3145ee6cfe9d84b4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_124.txt @@ -0,0 +1,2 @@ +", + "What is the total number of movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_125.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..c77a981cb84245f7e8c504d0b2959c16f0724647 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_125.txt @@ -0,0 +1,31 @@ +", + ] +table = pd.DataFrame.from_dict(data) +inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") +outputs = model(**inputs) +predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( + inputs, outputs.logits, outputs.logits_aggregation + ) +let's print out the results: +id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"} +aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] +answers = [] +for coordinates in predicted_answer_coordinates: + if len(coordinates) == 1: + # only a single cell: + answers.append(table.iat[coordinates[0]]) + else: + # multiple cells + cell_values = [] + for coordinate in coordinates: + cell_values.append(table.iat[coordinate]) + answers.append(", ".join(cell_values)) +display(table) +print("") +for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): + print(query) + if predicted_agg == "NONE": + print("Predicted answer: " + answer) + else: + print("Predicted answer: " + predicted_agg + " > " + answer) +What is the name of the first actor? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_126.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..4245bcd9ff5452308beeb3998ca5b0a1b1b8b0a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_126.txt @@ -0,0 +1,2 @@ +Predicted answer: Brad Pitt +How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_127.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..897707cbbcd9f05b006907ee903339f0c883c064 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_127.txt @@ -0,0 +1,2 @@ +Predicted answer: COUNT > 69 +What is the total number of movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_128.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..191c6042710494847dcf7a6306268d1b20346f98 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_128.txt @@ -0,0 +1,3 @@ +Predicted answer: SUM > 87, 53, 69 + +In case of a conversational set-up, then each table-question pair must be provided sequentially to the model, such that the prev_labels token types can be overwritten by the predicted labels of the previous table-question pair. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_129.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae1dd22a158adf674aeccf9c4168d2e7da337522 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_129.txt @@ -0,0 +1 @@ +Again, more info can be found in this notebook (for PyTorch) and this notebook (for TensorFlow). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_130.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..b40b683d302eb61fc1ebbf7372acd5276b7fcea3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_130.txt @@ -0,0 +1,40 @@ +Resources + +Text classification task guide +Masked language modeling task guide + +TAPAS specific outputs +[[autodoc]] models.tapas.modeling_tapas.TableQuestionAnsweringOutput +TapasConfig +[[autodoc]] TapasConfig +TapasTokenizer +[[autodoc]] TapasTokenizer + - call + - convert_logits_to_predictions + - save_vocabulary + +TapasModel +[[autodoc]] TapasModel + - forward +TapasForMaskedLM +[[autodoc]] TapasForMaskedLM + - forward +TapasForSequenceClassification +[[autodoc]] TapasForSequenceClassification + - forward +TapasForQuestionAnswering +[[autodoc]] TapasForQuestionAnswering + - forward + +TFTapasModel +[[autodoc]] TFTapasModel + - call +TFTapasForMaskedLM +[[autodoc]] TFTapasForMaskedLM + - call +TFTapasForSequenceClassification +[[autodoc]] TFTapasForSequenceClassification + - call +TFTapasForQuestionAnswering +[[autodoc]] TFTapasForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_48.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b3fb526293d46265f1ce348c9d77f91ee0777db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_48.txt @@ -0,0 +1 @@ +WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or averaging cell values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_49.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..98351f6362a0294d6a92ce02867fd6eff053b466 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_49.txt @@ -0,0 +1 @@ +You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his career?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_50.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..001c5c71a53ec374643f3ef9c0bb4195dc3b89be --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_50.txt @@ -0,0 +1 @@ +This case is also called weak supervision, since the model itself must learn the appropriate aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_51.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b4f10dd55991c9f00a94d3d10f5103c0a57f1a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_51.txt @@ -0,0 +1 @@ +WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation operator during training. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_52.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4c8b1bcd85fa3776e33db602a8ddc02dc44db23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_52.txt @@ -0,0 +1 @@ +This is also called strong supervision. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_53.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc4fd0792dfbe30bb3d448b84ffdc6341af0b923 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_53.txt @@ -0,0 +1 @@ +Here, learning the appropriate aggregation operator is much easier. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_54.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..000abaddb01f143005ae4230d3befa1dafb7dc31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_54.txt @@ -0,0 +1,8 @@ +To summarize: +| Task | Example dataset | Description | +|-------------------------------------|---------------------|---------------------------------------------------------------------------------------------------------| +| Conversational | SQA | Conversational, only cell selection questions | +| Weak supervision for aggregation | WTQ | Questions might involve aggregation, and the model must learn this given only the answer as supervision | +| Strong supervision for aggregation | WikiSQL-supervised | Questions might involve aggregation, and the model must learn this given the gold aggregation operator | + +Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_55.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c696abce50acaa084c0ef5897de9c06224611d97 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_55.txt @@ -0,0 +1,11 @@ +from transformers import TapasConfig, TapasForQuestionAnswering +for example, the base sized model with default SQA configuration +model = TapasForQuestionAnswering.from_pretrained("google/tapas-base") +or, the base sized model with WTQ configuration +config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") +model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +or, the base sized model with WikiSQL configuration +config = TapasConfig("google-base-finetuned-wikisql-supervised") +model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) + +Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. 
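As a brief aside (an addition, not from the original text), the WikiSQL-supervised configuration above can also be fetched by checkpoint name with from_pretrained, which avoids passing the name as a positional argument to TapasConfig:

from transformers import TapasConfig, TapasForQuestionAnswering

# alternative way to obtain the WikiSQL-supervised configuration from the hub
config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)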
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_56.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ff43fcaa6f6a433c717b00c1558e7d54a2e1b16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_56.txt @@ -0,0 +1 @@ +You can also experiment by defining any hyperparameters you want when initializing [TapasConfig], and then create a [TapasForQuestionAnswering] based on that configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_57.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..bae16fa2a2b7984133f99be2289b71eb30ab30a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_57.txt @@ -0,0 +1 @@ +For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_58.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8290b65465dca8cab94acce19295fd469d78f0f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_58.txt @@ -0,0 +1,9 @@ +Here's an example: + +from transformers import TapasConfig, TapasForQuestionAnswering +you can initialize the classification heads any way you want (see docs of TapasConfig) +config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) +initializing the pre-trained base sized model with our custom classification heads +model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) + +Initializing a model with a pre-trained base and randomly initialized classification heads from the hub can be done as shown below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_59.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a16a60603b929e7be39067196c8c284f87093e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_59.txt @@ -0,0 +1,13 @@ +Be sure to have installed the tensorflow_probability dependency: + +from transformers import TapasConfig, TFTapasForQuestionAnswering +for example, the base sized model with default SQA configuration +model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base") +or, the base sized model with WTQ configuration +config = TapasConfig.from_pretrained("google/tapas-base-finetuned-wtq") +model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) +or, the base sized model with WikiSQL configuration +config = TapasConfig("google-base-finetuned-wikisql-supervised") +model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) + +Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_60.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..51f510d53fc5b8de95afb4bf62ec729c3d6451c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_60.txt @@ -0,0 +1 @@ +You can also experiment by defining any hyperparameters you want when initializing [TapasConfig], and then create a [TFTapasForQuestionAnswering] based on that configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_61.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..bae16fa2a2b7984133f99be2289b71eb30ab30a8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_61.txt @@ -0,0 +1 @@ +For example, if you have a dataset that has both conversational questions and questions that might involve aggregation, then you can do it this way. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_62.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..408b4f12d2475e99aa6d996c922681b925d29296 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_62.txt @@ -0,0 +1,9 @@ +Here's an example: + +from transformers import TapasConfig, TFTapasForQuestionAnswering +you can initialize the classification heads any way you want (see docs of TapasConfig) +config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) +initializing the pre-trained base sized model with our custom classification heads +model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) + +What you can also do is start from an already fine-tuned checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_63.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..f451f32745775e26f240dfe2607ff3de6b2b883a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_63.txt @@ -0,0 +1 @@ +A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_64.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..34bea41dc4c2b3b547a070c1e32bcccad6ff9881 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_64.txt @@ -0,0 +1 @@ +See here for more info. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_65.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac1a7b9eda3603647b2d5db18308ca9d695837d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_65.txt @@ -0,0 +1 @@ +For a list of all pre-trained and fine-tuned TAPAS checkpoints available on HuggingFace's hub, see here. 
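For completeness, a minimal sketch (an addition, not from the original text) of starting from an already fine-tuned checkpoint instead of the plain pre-trained base, here using the SQA-finetuned model as an example:

from transformers import TapasForQuestionAnswering

# load an already fine-tuned checkpoint and continue fine-tuning from there
model = TapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa")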
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_66.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aa305260f8cd17857610480ad1c8d40c07aea2c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_66.txt @@ -0,0 +1,2 @@ +STEP 2: Prepare your data in the SQA format +Second, no matter what you picked above, you should prepare your dataset in the SQA format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_67.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..a046d5aa0ba66ad7b6cc19c767fe74b1b3b93939 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_67.txt @@ -0,0 +1,3 @@ +This format is a TSV/CSV file with the following columns: + +id: optional, id of the table-question pair, for bookkeeping purposes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_68.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..393715bba715c6b1585df41d116bfca8a6f67095 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_68.txt @@ -0,0 +1 @@ +annotator: optional, id of the person who annotated the table-question pair, for bookkeeping purposes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_69.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..887e8ed2f560457bcb17b9f0060e5cd7375c167a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_69.txt @@ -0,0 +1 @@ +position: integer indicating if the question is the first, second, third, related to the table. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_70.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..73b94c86317fb6a29b247149e78ceb15ff731315 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_70.txt @@ -0,0 +1 @@ +Only required in case of conversational setup (SQA). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_71.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea18dd81a2d8bf8cd6ef2162ade099f59e273432 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_71.txt @@ -0,0 +1 @@ +You don't need this column in case you're going for WTQ/WikiSQL-supervised. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_72.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0efb1e79fccdda1848790ed1c94524315b350b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_72.txt @@ -0,0 +1,3 @@ +question: string +table_file: string, name of a csv file containing the tabular data +answer_coordinates: list of one or more tuples (each tuple being a cell coordinate, i.e. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_73.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..685c722844f2089717701eed862df1d7523edea5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_73.txt @@ -0,0 +1,3 @@ +row, column pair that is part of the answer) +answer_text: list of one or more strings (each string being a cell value that is part of the answer) +aggregation_label: index of the aggregation operator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_74.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..30cce9e5fbe33e18fa7d7542654723892c50b90d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_74.txt @@ -0,0 +1,2 @@ +Only required in case of strong supervision for aggregation (the WikiSQL-supervised case) +float_answer: the float answer to the question, if there is one (np.nan if there isn't). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_75.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..81f34496d3738e83e19615901589314e8c068e56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_75.txt @@ -0,0 +1,3 @@ +Only required in case of weak supervision for aggregation (such as WTQ and WikiSQL) + +The tables themselves should be present in a folder, each table being a separate csv file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_76.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ad198c9b8914070cebcac57cbd9f11d856319d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_76.txt @@ -0,0 +1 @@ +Note that the authors of the TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the SQA format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_77.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e663c9e60a803ff88a28837e652e0e50a611765 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_77.txt @@ -0,0 +1 @@ +The author explains this here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_78.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..83767faa91e1083ceb7950555d6f76ab3eb6f5b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_78.txt @@ -0,0 +1 @@ +A conversion of this script that works with HuggingFace's implementation can be found here. 
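To make the format concrete, here is a hypothetical miniature training file (an illustration added here, not from the original docs; the file names and values are invented, and it shows the weak-supervision case, so float_answer is filled in):

import pandas as pd

# one question about one table, using the SQA-style columns described above
rows = [
    {
        "id": "q-0",
        "annotator": 0,
        "position": 0,
        "question": "What is the total number of movies?",
        "table_file": "actors.csv",
        "answer_coordinates": [(0, 1), (1, 1), (2, 1)],
        "answer_text": ["209"],
        "float_answer": 209.0,
    }
]
pd.DataFrame(rows).to_csv("train.tsv", sep="\t", index=False)
# each table lives in its own csv file, referenced by the table_file column
table = pd.DataFrame(
    {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]}
)
table.to_csv("actors.csv", index=False)

Note that writing the answer_coordinates and answer_text lists to a TSV stores them as strings, so code that reads the file back (such as the dataloaders shown later) typically has to parse them again, for example with ast.literal_eval.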
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_79.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..aac767da2d78a093651e002ba08e60d18cc943b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_79.txt @@ -0,0 +1 @@ +Interestingly, these conversion scripts are not perfect (the answer_coordinates and float_answer fields are populated based on the answer_text), meaning that WTQ and WikiSQL results could actually be improved. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_80.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b01374dad45c506a6a62ed29a1e41e2af3b0f47 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_80.txt @@ -0,0 +1,3 @@ +STEP 3: Convert your data into tensors using TapasTokenizer + +Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [TapasTokenizer] to convert table-question pairs into input_ids, attention_mask, token_type_ids and so on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_81.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..7922cd4b805b867f20b9b12f77e21508a5733d53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_81.txt @@ -0,0 +1,8 @@ +Again, based on which of the three cases you picked above, [TapasForQuestionAnswering] requires different +inputs to be fine-tuned: +| Task | Required inputs | +|------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| Conversational | input_ids, attention_mask, token_type_ids, labels | +| Weak supervision for aggregation | input_ids, attention_mask, token_type_ids, labels, numeric_values, numeric_values_scale, float_answer | +| Strong supervision for aggregation | input ids, attention mask, token type ids, labels, aggregation_labels | +[TapasTokenizer] creates the labels, numeric_values and numeric_values_scale based on the answer_coordinates and answer_text columns of the TSV file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_82.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..36d57705b5235eddb1e555e332fbc5809cbbfcf9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_82.txt @@ -0,0 +1 @@ +The float_answer and aggregation_labels are already in the TSV file of step 2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_83.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b8460950be03e086b98b8da88154d5e029082f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_83.txt @@ -0,0 +1,9 @@ +Here's an example: + +from transformers import TapasTokenizer +import pandas as pd +model_name = "google/tapas-base" +tokenizer = TapasTokenizer.from_pretrained(model_name) +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +queries = [ + "What is the name of the first actor? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_84.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0b4d030a4faa01bfdc209c9a0ebd794172310e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_84.txt @@ -0,0 +1,2 @@ +", + "How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_85.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1305a9242abd78c29803d3145ee6cfe9d84b4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_85.txt @@ -0,0 +1,2 @@ +", + "What is the total number of movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_86.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f6d74d5711a1f64479ba46551612d3b391fb44c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_86.txt @@ -0,0 +1,18 @@ +", + ] +answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] +answer_text = [["Brad Pitt"], ["69"], ["209"]] +table = pd.DataFrame.from_dict(data) +inputs = tokenizer( + table=table, + queries=queries, + answer_coordinates=answer_coordinates, + answer_text=answer_text, + padding="max_length", + return_tensors="pt", + ) +inputs +{'input_ids': tensor([[ ]]), 'attention_mask': tensor([[]]), 'token_type_ids': tensor([[[]]]), +'numeric_values': tensor([[ ]]), 'numeric_values_scale: tensor([[ ]]), labels: tensor([[ ]])} + +Note that [TapasTokenizer] expects the data of the table to be text-only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_87.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a3b7984ea88959c4cf3ca4e868fd4f0d0684474 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_87.txt @@ -0,0 +1 @@ +You can use .astype(str) on a dataframe to turn it into text-only data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_88.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..f971d81b03b58cf9341b51346d4c01c51a0e561f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_88.txt @@ -0,0 +1 @@ +Of course, this only shows how to encode a single training example. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_89.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..06761a15608f30a06bb5d8c576219e548bb15735 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_89.txt @@ -0,0 +1,40 @@ +It is advised to create a dataloader to iterate over batches: + +import torch +import pandas as pd +tsv_path = "your_path_to_the_tsv_file" +table_csv_path = "your_path_to_a_directory_containing_all_csv_files" +class TableDataset(torch.utils.data.Dataset): + def init(self, data, tokenizer): + self.data = data + self.tokenizer = tokenizer + + def getitem(self, idx): + item = data.iloc[idx] + table = pd.read_csv(table_csv_path + item.table_file).astype( + str + ) # be sure to make your table data text only + encoding = self.tokenizer( + table=table, + queries=item.question, + answer_coordinates=item.answer_coordinates, + answer_text=item.answer_text, + truncation=True, + padding="max_length", + return_tensors="pt", + ) + # remove the batch dimension which the tokenizer adds by default + encoding = {key: val.squeeze(0) for key, val in encoding.items()} + # add the float_answer which is also required (weak supervision for aggregation case) + encoding["float_answer"] = torch.tensor(item.float_answer) + return encoding + def len(self): + return len(self.data) + +data = pd.read_csv(tsv_path, sep="\t") +train_dataset = TableDataset(data, tokenizer) +train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32) +`` + + +Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use [TapasTokenizer] to convert table-question pairs intoinput_ids,attention_mask,token_type_idsand so on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_90.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..f364d9538b2ebe6779c4cbdb61cd088b1dfa3eb3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_90.txt @@ -0,0 +1,9 @@ +Again, based on which of the three cases you picked above, [TFTapasForQuestionAnswering`] requires different +inputs to be fine-tuned: + +| Task | Required inputs | +|------------------------------------|---------------------------------------------------------------------------------------------------------------------| +| Conversational | input_ids, attention_mask, token_type_ids, labels | +| Weak supervision for aggregation | input_ids, attention_mask, token_type_ids, labels, numeric_values, numeric_values_scale, float_answer | +| Strong supervision for aggregation | input ids, attention mask, token type ids, labels, aggregation_labels | +[TapasTokenizer] creates the labels, numeric_values and numeric_values_scale based on the answer_coordinates and answer_text columns of the TSV file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_91.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..36d57705b5235eddb1e555e332fbc5809cbbfcf9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_91.txt @@ -0,0 +1 @@ +The float_answer and aggregation_labels are already in the TSV file of step 2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_92.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b8460950be03e086b98b8da88154d5e029082f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_92.txt @@ -0,0 +1,9 @@ +Here's an example: + +from transformers import TapasTokenizer +import pandas as pd +model_name = "google/tapas-base" +tokenizer = TapasTokenizer.from_pretrained(model_name) +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +queries = [ + "What is the name of the first actor? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_93.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0b4d030a4faa01bfdc209c9a0ebd794172310e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_93.txt @@ -0,0 +1,2 @@ +", + "How many movies has George Clooney played in? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_94.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1305a9242abd78c29803d3145ee6cfe9d84b4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_94.txt @@ -0,0 +1,2 @@ +", + "What is the total number of movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_95.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f8af4010940f3eb5006199e9abed98c9ff52740 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_95.txt @@ -0,0 +1,18 @@ +", + ] +answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]] +answer_text = [["Brad Pitt"], ["69"], ["209"]] +table = pd.DataFrame.from_dict(data) +inputs = tokenizer( + table=table, + queries=queries, + answer_coordinates=answer_coordinates, + answer_text=answer_text, + padding="max_length", + return_tensors="tf", + ) +inputs +{'input_ids': tensor([[ ]]), 'attention_mask': tensor([[]]), 'token_type_ids': tensor([[[]]]), +'numeric_values': tensor([[ ]]), 'numeric_values_scale: tensor([[ ]]), labels: tensor([[ ]])} + +Note that [TapasTokenizer] expects the data of the table to be text-only. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_96.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a3b7984ea88959c4cf3ca4e868fd4f0d0684474 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_96.txt @@ -0,0 +1 @@ +You can use .astype(str) on a dataframe to turn it into text-only data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_97.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..f971d81b03b58cf9341b51346d4c01c51a0e561f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_97.txt @@ -0,0 +1 @@ +Of course, this only shows how to encode a single training example. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_98.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6257242fb27c84fcde892e653a95909ccbd55fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_98.txt @@ -0,0 +1,50 @@ +It is advised to create a dataloader to iterate over batches: + +import tensorflow as tf +import pandas as pd +tsv_path = "your_path_to_the_tsv_file" +table_csv_path = "your_path_to_a_directory_containing_all_csv_files" +class TableDataset: + def init(self, data, tokenizer): + self.data = data + self.tokenizer = tokenizer + + def iter(self): + for idx in range(self.len()): + item = self.data.iloc[idx] + table = pd.read_csv(table_csv_path + item.table_file).astype( + str + ) # be sure to make your table data text only + encoding = self.tokenizer( + table=table, + queries=item.question, + answer_coordinates=item.answer_coordinates, + answer_text=item.answer_text, + truncation=True, + padding="max_length", + return_tensors="tf", + ) + # remove the batch dimension which the tokenizer adds by default + encoding = {key: tf.squeeze(val, 0) for key, val in encoding.items()} + # add the float_answer which is also required (weak supervision for aggregation case) + encoding["float_answer"] = tf.convert_to_tensor(item.float_answer, dtype=tf.float32) + yield encoding["input_ids"], encoding["attention_mask"], encoding["numeric_values"], encoding[ + "numeric_values_scale" + ], encoding["token_type_ids"], encoding["labels"], encoding["float_answer"] + def len(self): + return len(self.data) + +data = pd.read_csv(tsv_path, sep="\t") +train_dataset = TableDataset(data, tokenizer) +output_signature = ( + tf.TensorSpec(shape=(512,), dtype=tf.int32), + tf.TensorSpec(shape=(512,), dtype=tf.int32), + tf.TensorSpec(shape=(512,), dtype=tf.float32), + tf.TensorSpec(shape=(512,), dtype=tf.float32), + tf.TensorSpec(shape=(512, 7), dtype=tf.int32), + tf.TensorSpec(shape=(512,), dtype=tf.int32), + tf.TensorSpec(shape=(512,), dtype=tf.float32), + ) +train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32) + +Note that here, we encode each table-question pair independently. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapas/chunk_99.txt b/chunked/content_aware_chunking/model_doc_tapas/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba04cd1f28d941f7d4257f4a2b1276a767d32345 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapas/chunk_99.txt @@ -0,0 +1 @@ +This is fine as long as your dataset is not conversational. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_14.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa8d282d6fb60f03ab1474bc3109b5b70de9dfb9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_14.txt @@ -0,0 +1,3 @@ +Usage tips + +TAPEX is a generative (seq2seq) model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_15.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..098e56fb7405a2527416021fd72a35d2bd8f8de7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_15.txt @@ -0,0 +1 @@ +One can directly plug in the weights of TAPEX into a BART model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_16.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d08f1a7e04fad87e0f6a3ae7bdbdcd2bef7e534 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_16.txt @@ -0,0 +1 @@ +TAPEX has checkpoints on the hub that are either pre-trained only, or fine-tuned on WTQ, SQA, WikiSQL and TabFact. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_17.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fc7ee67a60327a7b664c36e598308b351a7d060 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_17.txt @@ -0,0 +1 @@ +Sentences + tables are presented to the model as sentence + " " + linearized table. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_18.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..7463b416c0eb98fa6260d66540d3e10332558acc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_18.txt @@ -0,0 +1,2 @@ +The linearized table has the following format: + col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : . \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_19.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..85b117468f6472866b2c50f65677b30b39ad06f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_19.txt @@ -0,0 +1 @@ +TAPEX has its own tokenizer, that allows to prepare all data for the model easily. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_20.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ee8f303470f3c8799f24fe254f35dfbb8031f18 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_20.txt @@ -0,0 +1,2 @@ +One can pass Pandas DataFrames and strings to the tokenizer, + and it will automatically create the input_ids and attention_mask (as shown in the usage examples below). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_21.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..96648f9fd60da604cb7d0951e1e13cf785ecde88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_21.txt @@ -0,0 +1,2 @@ +Usage: inference +Below, we illustrate how to use TAPEX for table question answering. 
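As a concrete illustration of the linearization described above (added here, not part of the original text; the exact casing and spacing depend on the tokenizer settings), the Actors table used in these examples would be flattened roughly as follows:

# rough sketch of the string the tokenizer builds internally from sentence + table;
# it is an approximation, not the tokenizer's exact output
sentence = "how many movies does Leonardo Di Caprio have?"
linearized_table = (
    "col : Actors | Number of movies "
    "row 1 : Brad Pitt | 87 "
    "row 2 : Leonardo Di Caprio | 53 "
    "row 3 : George Clooney | 69"
)
model_input = sentence + " " + linearized_table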
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_22.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2207e72dcb200a13d85570a492ac82e26683026c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_22.txt @@ -0,0 +1 @@ +As one can see, one can directly plug in the weights of TAPEX into a BART model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_23.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..e101117567593e11cb2cd7d216b5bafc9add0894 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_23.txt @@ -0,0 +1,2 @@ +We use the Auto API, which will automatically instantiate the appropriate tokenizer ([TapexTokenizer]) and model ([BartForConditionalGeneration]) for us, +based on the configuration file of the checkpoint on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_24.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..60473999b283e2d9ac04fe1621106741850df5a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_24.txt @@ -0,0 +1,10 @@ +thon + +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +import pandas as pd +tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-wtq") +model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/tapex-large-finetuned-wtq") +prepare table + question +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +table = pd.DataFrame.from_dict(data) +question = "how many movies does Leonardo Di Caprio have?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_25.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5eb99feee033599263b7113b6d82285acbb150c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_25.txt @@ -0,0 +1,9 @@ +encoding = tokenizer(table, question, return_tensors="pt") +let the model generate an answer autoregressively +outputs = model.generate(**encoding) +decode back to text +predicted_answer = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0] +print(predicted_answer) +53 + +Note that [TapexTokenizer] also supports batched inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_26.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..aeb18cef534a7789484f3f02e8988fadac731e5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_26.txt @@ -0,0 +1,2 @@ +Hence, one can provide a batch of different tables/questions, or a batch of a single table +and multiple questions, or a batch of a single query and multiple tables. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_27.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..efd3c58b9499c64212706d25ea56a4f37bec8e71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_27.txt @@ -0,0 +1,8 @@ +Let's illustrate this: +thon + +prepare table + question +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +table = pd.DataFrame.from_dict(data) +questions = [ + "how many movies does Leonardo Di Caprio have? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_28.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..8601bf669a13832578d1e8f142505b9652c669f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_28.txt @@ -0,0 +1,2 @@ +", + "which actor has 69 movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_29.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..25f5979b25ec51687d28932fe0f652557ddc3a7b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_29.txt @@ -0,0 +1,2 @@ +", + "what's the first name of the actor who has 87 movies? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_30.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcc554dbb70e8563599ec0eecc3fd208929a2ebc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_30.txt @@ -0,0 +1,10 @@ +", + ] +encoding = tokenizer(table, questions, padding=True, return_tensors="pt") +let the model generate an answer autoregressively +outputs = model.generate(**encoding) +decode back to text +tokenizer.batch_decode(outputs, skip_special_tokens=True) +[' 53', ' george clooney', ' brad pitt'] + +In case one wants to do table verification (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_31.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b6fdda39093f9ce19a36b9c77784789a9da7eee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_31.txt @@ -0,0 +1,2 @@ +the task of determining whether a given sentence is supported or refuted by the contents +of a table), one can instantiate a [BartForSequenceClassification] model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_32.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a02cf9b98d0f4edd1ca0e5d8416aaf80b675e75 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_32.txt @@ -0,0 +1,2 @@ +TAPEX has checkpoints on the hub fine-tuned on TabFact, an important +benchmark for table fact checking (it achieves 84% accuracy). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_33.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f83b724ba335c8fd325e9d1631b82c4d1b28959d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_33.txt @@ -0,0 +1 @@ +The code example below again leverages the Auto API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_34.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7163a1ce73b6ec6ad38ab40d142151e7603d2de --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_34.txt @@ -0,0 +1,19 @@ +thon + +from transformers import AutoTokenizer, AutoModelForSequenceClassification +tokenizer = AutoTokenizer.from_pretrained("microsoft/tapex-large-finetuned-tabfact") +model = AutoModelForSequenceClassification.from_pretrained("microsoft/tapex-large-finetuned-tabfact") +prepare table + sentence +data = {"Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], "Number of movies": ["87", "53", "69"]} +table = pd.DataFrame.from_dict(data) +sentence = "George Clooney has 30 movies" +encoding = tokenizer(table, sentence, return_tensors="pt") +forward pass +outputs = model(**encoding) +print prediction +predicted_class_idx = outputs.logits[0].argmax(dim=0).item() +print(model.config.id2label[predicted_class_idx]) +Refused + + +TAPEX architecture is the same as BART, except for tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_35.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2f27e74075dbfffd0a9d06b9ec1a29851964c9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_35.txt @@ -0,0 +1,2 @@ +Refer to BART documentation for information on +configuration classes and their parameters. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_36.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cb106485598c7908e2341cd54df73c01ea5aa8e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_36.txt @@ -0,0 +1 @@ +TAPEX-specific tokenizer is documented below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tapex/chunk_37.txt b/chunked/content_aware_chunking/model_doc_tapex/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..af260c4c9b6846334a909346850f8e2d26169121 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tapex/chunk_37.txt @@ -0,0 +1,4 @@ +TapexTokenizer +[[autodoc]] TapexTokenizer + - call + - save_vocabulary \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..935ef016562b4475b1de699d780f3f4186733213 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_10.txt @@ -0,0 +1 @@ +These serve as "positional encodings" for the Transformer encoder. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9891a155eda1056ed5ead1269f1a04422073fe1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_11.txt @@ -0,0 +1 @@ +Examples are "day of the month", "month of the year", etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bb16f4a39fecb7a9faeb4fded2dce3af6dbd871 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_12.txt @@ -0,0 +1 @@ +as scalar values (and then stacked together as a vector). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..405c3635f7723d2eb4b2fd1144346335f90603c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_13.txt @@ -0,0 +1 @@ +e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c9633391a649860de69e4c5be599f7d1979446f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_14.txt @@ -0,0 +1 @@ +if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b0eaaf22cda7f2b4776296bdec07b4690facedf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_15.txt @@ -0,0 +1 @@ +future_time_features: temporal features which the model will add to future_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fb2ff5bdb0aa7722e695f7f266185aef1d21a64 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_16.txt @@ -0,0 +1 @@ +These serve as "positional encodings" for the Transformer decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_17.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9891a155eda1056ed5ead1269f1a04422073fe1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_17.txt @@ -0,0 +1 @@ +Examples are "day of the month", "month of the year", etc. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_18.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bb16f4a39fecb7a9faeb4fded2dce3af6dbd871 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_18.txt @@ -0,0 +1 @@ +as scalar values (and then stacked together as a vector). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_19.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..405c3635f7723d2eb4b2fd1144346335f90603c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_19.txt @@ -0,0 +1 @@ +e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_20.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c9633391a649860de69e4c5be599f7d1979446f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_20.txt @@ -0,0 +1 @@ +if a given time-series value was obtained on the 11th of August, then one could have [11, 8] as time feature vector (11 being "day of the month", 8 being "month of the year"). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_21.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..35e2c0ad0a893aaf6c8dbc871189712fb8f57254 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_21.txt @@ -0,0 +1 @@ +static_categorical_features: categorical features which are static over time (i.e., have the same value for all past_values and future_values). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_22.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc39f861ebbd767a6f48fcac04e66e221366a0d1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_22.txt @@ -0,0 +1 @@ +An example here is the store ID or region ID that identifies a given time-series. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_23.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1eb491d674fe3b4d5c7e95e0f33c60299ced2ce2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_23.txt @@ -0,0 +1 @@ +Note that these features need to be known for ALL data points (also those in the future). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_24.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0157b017cb7edd131758619c7e5d42aa079fa2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_24.txt @@ -0,0 +1 @@ +static_real_features: real-valued features which are static over time (i.e., have the same value for all past_values and future_values). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_25.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e26c9b6e1dc025af7264c05e955783040b8aaae9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_25.txt @@ -0,0 +1,2 @@ +An example here is the image representation of the product for which you have the time-series values (like the ResNet embedding of a "shoe" picture, +if your time-series is about the sales of shoes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_26.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1eb491d674fe3b4d5c7e95e0f33c60299ced2ce2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_26.txt @@ -0,0 +1 @@ +Note that these features need to be known for ALL data points (also those in the future). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_27.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bc28480d5405102e25b3f0bf0d692c7cac1d003 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_27.txt @@ -0,0 +1 @@ +The model is trained using "teacher-forcing", similar to how a Transformer is trained for machine translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_28.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6bf24dc76494fa6b866129aba288060b01af7b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_28.txt @@ -0,0 +1,2 @@ +This means that, during training, one shifts the +future_values one position to the right as input to the decoder, prepended by the last value of past_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_29.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1998a1727c75f51b96c75ec1d461da73dd78229 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_29.txt @@ -0,0 +1,2 @@ +At each time step, the model needs to predict the +next target. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_30.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..e12490943ec715a6969b44719a9d7dbe6397a784 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_30.txt @@ -0,0 +1,2 @@ +So the set-up of training is similar to a GPT model for language, except that there's no notion of decoder_start_token_id (we just use the last value +of the context as initial input for the decoder). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_31.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..291abf0b487199adac0d5ba1522a8d4f065b9923 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_31.txt @@ -0,0 +1 @@ +At inference time, we give the final value of the past_values as input to the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_32.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdb223adeaad061b27457d116ae81b34406ef98a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_32.txt @@ -0,0 +1,2 @@ +Next, we can sample from the model to make a prediction at the next time step, +which is then fed to the decoder in order to make the next prediction (also called autoregressive generation). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_33.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..da320e00afa0cc964a5a7ef63dbffe0646db1773 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_33.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_34.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_34.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_35.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_35.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
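The teacher-forcing training step and the autoregressive inference loop described above can be sketched as follows. This is a minimal sketch modelled on the upstream usage example: the checkpoint name, the test-batch dataset and its keys are assumptions and may need to be adapted to your own data.

```py
# Hedged sketch: teacher-forcing training vs. autoregressive generation with
# TimeSeriesTransformerForPrediction. Checkpoint and batch file are assumptions
# borrowed from the upstream example.
import torch
from huggingface_hub import hf_hub_download
from transformers import TimeSeriesTransformerForPrediction

file = hf_hub_download(
    repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
)
batch = torch.load(file)

model = TimeSeriesTransformerForPrediction.from_pretrained(
    "huggingface/time-series-transformer-tourism-monthly"
)

# training: past and future values are both provided (teacher forcing)
outputs = model(
    past_values=batch["past_values"],
    past_time_features=batch["past_time_features"],
    past_observed_mask=batch["past_observed_mask"],
    static_categorical_features=batch["static_categorical_features"],
    static_real_features=batch["static_real_features"],
    future_values=batch["future_values"],
    future_time_features=batch["future_time_features"],
)
outputs.loss.backward()

# inference: only past values are given; future values are generated autoregressively
predictions = model.generate(
    past_values=batch["past_values"],
    past_time_features=batch["past_time_features"],
    past_observed_mask=batch["past_observed_mask"],
    static_categorical_features=batch["static_categorical_features"],
    static_real_features=batch["static_real_features"],
    future_time_features=batch["future_time_features"],
)
mean_forecast = predictions.sequences.mean(dim=1)  # average over sampled trajectories
```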
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_36.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab34487ae8fc4f34b89b882efe0bbda0350a7b65 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_36.txt @@ -0,0 +1,10 @@ +Check out the Time Series Transformer blog-post in HuggingFace blog: Probabilistic Time Series Forecasting with 🤗 Transformers + +TimeSeriesTransformerConfig +[[autodoc]] TimeSeriesTransformerConfig +TimeSeriesTransformerModel +[[autodoc]] TimeSeriesTransformerModel + - forward +TimeSeriesTransformerForPrediction +[[autodoc]] TimeSeriesTransformerForPrediction + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f8210d667e0b08fbb45f483003c2d5d4019458b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_8.txt @@ -0,0 +1 @@ +In addition to the raw (past_values and future_values), one typically provides additional features to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e46bf5fcb9fe8742e1d3c8358441339fe196ff70 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_time_series_transformer/chunk_9.txt @@ -0,0 +1,2 @@ +These can be the following: +past_time_features: temporal features which the model will add to past_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05577051a6f508151ff98585dae5cdf8aaa59d5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by fcakyon. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dae7a701d13925d7ffd3ff99d28ea9a13b6077c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_12.txt @@ -0,0 +1,2 @@ +Usage tips +There are many pretrained variants. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6a063d855fbac5dc88a4ccfe1e4fde2b643edee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_13.txt @@ -0,0 +1 @@ +Select your pretrained model based on the dataset it is trained on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..edcebb9dd588cf751a544324ff85980982f096cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_14.txt @@ -0,0 +1,2 @@ +Moreover, +the number of input frames per clip changes based on the model size so you should consider this parameter while selecting your pretrained model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4948b78d3fe2a3818711d69004c58cac6d16ec92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_15.txt @@ -0,0 +1,12 @@ +Resources + +Video classification task guide + +TimesformerConfig +[[autodoc]] TimesformerConfig +TimesformerModel +[[autodoc]] TimesformerModel + - forward +TimesformerForVideoClassification +[[autodoc]] TimesformerForVideoClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_5.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..89ad8159184f8d3bf83da492cfd4403142f546b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_5.txt @@ -0,0 +1 @@ +Our method, named "TimeSformer," adapts the standard Transformer architecture to video by enabling spatiotemporal feature learning directly from a sequence of frame-level patches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..504a06d9ed6dc814b81f17e2bf00dce79d8c1add --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_6.txt @@ -0,0 +1 @@ +Our experimental study compares different self-attention schemes and suggests that "divided attention," where temporal attention and spatial attention are separately applied within each block, leads to the best video classification accuracy among the design choices considered. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..33f4213501ca02ddb4c62b661d0b11692d6967e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_7.txt @@ -0,0 +1 @@ +Despite the radically new design, TimeSformer achieves state-of-the-art results on several action recognition benchmarks, including the best reported accuracy on Kinetics-400 and Kinetics-600. 
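To make the tip above concrete, here is a minimal sketch that reads the expected number of frames per clip from the checkpoint's config before building the input; the checkpoint name is an assumed example and random frames stand in for a real decoded clip.

```py
# Hedged sketch: the number of input frames per clip is a property of the chosen
# checkpoint, so read it from the config instead of hard-coding it.
import numpy as np
import torch
from transformers import AutoImageProcessor, TimesformerForVideoClassification

ckpt = "facebook/timesformer-base-finetuned-k400"  # assumed example checkpoint
processor = AutoImageProcessor.from_pretrained(ckpt)
model = TimesformerForVideoClassification.from_pretrained(ckpt)

num_frames = model.config.num_frames  # e.g. 8 for this variant
video = list(np.random.randn(num_frames, 3, 224, 224))  # stand-in for a real clip

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[int(logits.argmax(-1))])
```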
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f24ca90dab33e8cf3ba9f004cace7a7234f642c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_8.txt @@ -0,0 +1 @@ +Finally, compared to 3D convolutional networks, our model is faster to train, it can achieve dramatically higher test efficiency (at a small drop in accuracy), and it can also be applied to much longer video clips (over one minute long). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_timesformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_timesformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2aa023b3f33bda903a1c00d7e956908ca01a6f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_timesformer/chunk_9.txt @@ -0,0 +1 @@ +Code and models are available at: this https URL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_10.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a1666164cfaac8ac75bcf9612e7f2deae3e37ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_10.txt @@ -0,0 +1,2 @@ +Further, we show that this approach can be combined with +existing model-free algorithms to yield a state-of-the-art planner in sparse-reward, long-horizon tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_11.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..970dc5ed40526251c05d139e1955cf2c7f9cbf31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by CarlCochet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_12.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_13.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3d06f922a5d87bfad5e28d88f2545491173374 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_13.txt @@ -0,0 +1,2 @@ +Usage tips +This Transformer is used for deep reinforcement learning. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_14.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..13ed755f92f4617bf68a5719fbbf3f575530580e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_14.txt @@ -0,0 +1,2 @@ +To use it, you need to create sequences from +actions, states and rewards from all previous timesteps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_15.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..53c84bdc2e4d6ce7630bc68fd6ff4085e1cbfc71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_15.txt @@ -0,0 +1,2 @@ +This model will treat all these elements together +as one big sequence (a trajectory). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_16.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c3ec3a7837e8bb86c11516fe19dc8fa3c3ea3d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_16.txt @@ -0,0 +1,5 @@ +TrajectoryTransformerConfig +[[autodoc]] TrajectoryTransformerConfig +TrajectoryTransformerModel +[[autodoc]] TrajectoryTransformerModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd1c62a8bcd42b636646d52b291fcf6e7b2cf94e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_6.txt @@ -0,0 +1,2 @@ +Viewed in this way, it is tempting to consider whether high-capacity sequence prediction models that work well +in other domains, such as natural-language processing, can also provide effective solutions to the RL problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..de7aeec4af6c24df7922d1789f9d152927833a09 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_7.txt @@ -0,0 +1,2 @@ +To this end, we explore how RL can be tackled with the tools of sequence modeling, using a Transformer architecture +to model distributions over trajectories and repurposing beam search as a planning algorithm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..112f6fcb09c31f499dbb5d00866d464f2dfdbb5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_8.txt @@ -0,0 +1,3 @@ +Framing RL as sequence +modeling problem simplifies a range of design decisions, allowing us to dispense with many of the components common +in offline RL algorithms. 
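A rough sketch of what such a flattened trajectory looks like in code is given below. It is modelled on the upstream usage example, so the checkpoint name, the HalfCheetah dimensions and the random token ids (which stand in for properly discretized states, actions and rewards) are assumptions.

```py
# Hedged sketch: states, actions and rewards are discretized into token ids and
# flattened into one long "trajectory" sequence per example.
import torch
from transformers import TrajectoryTransformerModel

model = TrajectoryTransformerModel.from_pretrained(
    "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"  # assumed checkpoint
)
model.eval()

batch_size = 2
obs_dim, act_dim = 17, 6                 # HalfCheetah dimensions (assumption)
step_len = obs_dim + act_dim + 1         # state tokens + action tokens + reward token

# random ids stand in for properly discretized environment data
trajectories = torch.randint(0, model.config.vocab_size, (batch_size, step_len))
with torch.no_grad():
    outputs = model(trajectories, use_cache=True)
print(outputs.logits.shape)
```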
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a00b9bd66e5942301b96e6601a9511faa9505760 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trajectory_transformer/chunk_9.txt @@ -0,0 +1,2 @@ +We demonstrate the flexibility of this approach across long-horizon dynamics prediction, +imitation learning, goal-conditioned RL, and offline RL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_10.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a58743844b401e010494b5175f07979217497307 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_10.txt @@ -0,0 +1,2 @@ +We propose a novel neural architecture Transformer-XL that enables learning dependency +beyond a fixed length without disrupting temporal coherence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_11.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f56f22614debe2f06354361cd243b2a26e01a996 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_11.txt @@ -0,0 +1,2 @@ +It consists of a segment-level recurrence mechanism and a +novel positional encoding scheme. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_12.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..446b42ff422117fe9d274ff15e39ca85d7cedc6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_12.txt @@ -0,0 +1,2 @@ +Our method not only enables capturing longer-term dependency, but also resolves the +context fragmentation problem. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_13.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..27e3f14c801464b4819abed1a016257ad091dea7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_13.txt @@ -0,0 +1,3 @@ +As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450% +longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+ +times faster than vanilla Transformers during evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_14.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a6015bd3317157a5fcd715b65d69a1240350289 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_14.txt @@ -0,0 +1,3 @@ +Notably, we improve the state-of-the-art results of +bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn +Treebank (without finetuning). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_15.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c76cd69c4c3d11ad063326553704349f3f739802 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_15.txt @@ -0,0 +1,2 @@ +When trained only on WikiText-103, Transformer-XL manages to generate reasonably +coherent, novel text articles with thousands of tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_16.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_16.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_17.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_17.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_18.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..25a42c5f0fd96a9babf5867f7855a304498fd224 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_18.txt @@ -0,0 +1,3 @@ +Usage tips + +Transformer-XL uses relative sinusoidal positional embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_19.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a6429a33386f09588b4ea83978b57e161426e32 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_19.txt @@ -0,0 +1 @@ +Padding can be done on the left or on the right. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_20.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e43bc9689860fe18ed986bed2e672fa10d610b5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_20.txt @@ -0,0 +1,2 @@ +The + original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_21.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..0343e48852e77107a9318c97bdfdd74417ab0270 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_21.txt @@ -0,0 +1 @@ +Transformer-XL is one of the few models that has no sequence length limit. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_22.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd0602dc130a96cba2aa753280e5c6af275f990e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_22.txt @@ -0,0 +1 @@ +Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to regular RNNs with two consecutive inputs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_23.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce09c52fb2fe5ce8f62307d6d6ea0f1e4337fe9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_23.txt @@ -0,0 +1 @@ +In this context, a segment is a number of consecutive tokens (for instance 512) that may span across multiple documents, and segments are fed in order to the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_24.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..257722be53257008186c101695846b1dbcb496f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_24.txt @@ -0,0 +1 @@ +Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention scores. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_25.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a6f17cce8927fbaf1c26c12356ecddbaf3aa8c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_25.txt @@ -0,0 +1 @@ +This allows the model to pay attention to information that was in the previous segment as well as the current one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_26.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..b25daf184d68ae9b9bc29e3552c22acbedcbeb23 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_26.txt @@ -0,0 +1 @@ +By stacking multiple attention layers, the receptive field can be increased to multiple previous segments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_27.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..090e260e350f79f305cb80ffe35505b2b549ae9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_27.txt @@ -0,0 +1 @@ +This changes the positional embeddings to relative positional embeddings (as regular positional embeddings would give the same results for the current input and the current hidden state at a given position) and requires some adjustments in the way attention scores are computed.
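The recurrence described above is exposed through the model's cached hidden states (`mems`). A minimal sketch, assuming a transformers version that still ships the now-deprecated Transformer-XL classes and the transfo-xl/transfo-xl-wt103 checkpoint, of feeding consecutive segments while carrying the cache forward:

```py
# Hedged sketch of segment-level recurrence: the hidden states ("mems") returned for
# one segment are passed back in when processing the next segment.
import torch
from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer

tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")  # assumed checkpoint id
model = TransfoXLLMHeadModel.from_pretrained("transfo-xl/transfo-xl-wt103")
model.eval()

text = "Transformer-XL reuses the hidden states of the previous segment. " * 16
input_ids = tokenizer(text, return_tensors="pt")["input_ids"]

mems = None
with torch.no_grad():
    for segment in torch.split(input_ids, 64, dim=1):  # consecutive segments, in order
        outputs = model(segment, mems=mems)
        mems = outputs.mems  # carried over so the next segment can attend to this one
```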
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_28.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a74845985748f8b2c6600391f9885127d6678da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_28.txt @@ -0,0 +1,41 @@ +TransformerXL does not work with torch.nn.DataParallel due to a bug in PyTorch, see issue #36035 + +Resources + +Text classification task guide +Causal language modeling task guide + +TransfoXLConfig +[[autodoc]] TransfoXLConfig +TransfoXLTokenizer +[[autodoc]] TransfoXLTokenizer + - save_vocabulary +TransfoXL specific outputs +[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput +[[autodoc]] models.deprecated.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput +[[autodoc]] models.deprecated.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput +[[autodoc]] models.deprecated.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput + +TransfoXLModel +[[autodoc]] TransfoXLModel + - forward +TransfoXLLMHeadModel +[[autodoc]] TransfoXLLMHeadModel + - forward +TransfoXLForSequenceClassification +[[autodoc]] TransfoXLForSequenceClassification + - forward + +TFTransfoXLModel +[[autodoc]] TFTransfoXLModel + - call +TFTransfoXLLMHeadModel +[[autodoc]] TFTransfoXLLMHeadModel + - call +TFTransfoXLForSequenceClassification +[[autodoc]] TFTransfoXLForSequenceClassification + - call + +Internal Layers +[[autodoc]] AdaptiveEmbedding +[[autodoc]] TFAdaptiveEmbedding \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_9.txt b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..214ab82499830a7d5660c4733f0ca1f0d63bd2e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_transfo-xl/chunk_9.txt @@ -0,0 +1,3 @@ +The abstract from the paper is the following: +Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the +setting of language modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_10.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f77045ee311368b00dbb28323e23513db9353bb9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_10.txt @@ -0,0 +1 @@ +Please refer to the [VisionEncoderDecoder] class on how to use this model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_11.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_12.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f16a20874db68468d36337353044b26ede99569 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_12.txt @@ -0,0 +1,2 @@ +The original code can be found +here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_13.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..324c23f587b3125b266ff43c0fbbe23428bbd931 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_13.txt @@ -0,0 +1,5 @@ +Usage tips + +The quickest way to get started with TrOCR is by checking the tutorial + notebooks, which show how to use the model + at inference time as well as fine-tuning on custom data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_14.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..39b097d3dc5d77787a3c3ca53d55b9853e126206 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_14.txt @@ -0,0 +1 @@ +TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_15.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3331cf7ee18878160c19940280cb31cf00a57a74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_15.txt @@ -0,0 +1,2 @@ +It achieves state-of-the-art results + on both printed (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_16.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..18e047c56311eee1b4beebd1af64625e7cde8f25 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_16.txt @@ -0,0 +1 @@ +the SROIE dataset and handwritten (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_17.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7da70fb7dcbf6ec34648eea786a988c3d597ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_17.txt @@ -0,0 +1,2 @@ +the IAM + Handwriting dataset text recognition tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_18.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f78472a72030b734746d351b2f738e0cebee5e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_18.txt @@ -0,0 +1,2 @@ +For more + information, see the official models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_19.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4168bedbcf47893a51452430cdf24a647d3df20d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_19.txt @@ -0,0 +1 @@ +TrOCR is always used within the VisionEncoderDecoder framework. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_20.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a439942f8019504d1a722aa329e3a978be548139 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_20.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with TrOCR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_21.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_21.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_22.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_23.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e1fa971abb866d6fbc0b9b997bf1dce2b2ebbbf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_23.txt @@ -0,0 +1 @@ +A blog post on Accelerating Document AI with TrOCR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_24.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6237de934149b8c7c9e9c62f5aac3e61144685 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_24.txt @@ -0,0 +1 @@ +A blog post on how to Document AI with TrOCR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_25.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..681dd9347015724229480559bc8eb49f8b7bbb87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_25.txt @@ -0,0 +1 @@ +A notebook on how to finetune TrOCR on IAM Handwriting Database using Seq2SeqTrainer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_26.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..44055aec19cceba089c80937683d0b95683cc680 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_26.txt @@ -0,0 +1 @@ +A notebook on inference with TrOCR and Gradio demo. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_27.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..36e6cb292bd546b2a6cebc47dbab8c7a9e9a55c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_27.txt @@ -0,0 +1 @@ +A notebook on fine-tuning TrOCR on the IAM Handwriting Database using native PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_28.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e37b112aee2bb20eeeb7b9416c3de3c0d622931 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_28.txt @@ -0,0 +1 @@ +A notebook on evaluating TrOCR on the IAM test set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_29.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ff40cbb98c8317139429bb9a54403db2a68e6df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_29.txt @@ -0,0 +1 @@ +Causal language modeling task guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_30.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee69684172dddf6dd81c07c0c89625e506a62339 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_30.txt @@ -0,0 +1,3 @@ +⚡️ Inference + +An interactive demo of TrOCR handwritten character recognition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_31.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7c026df1eb19823f3c9cc7cbab70d95b979312 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_31.txt @@ -0,0 +1,3 @@ +Inference +TrOCR's [VisionEncoderDecoder] model accepts images as input and makes use of +[~generation.GenerationMixin.generate] to autoregressively generate text given the input image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_32.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcb90f1d33cef8741ba628ad861583ecba95331c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_32.txt @@ -0,0 +1,2 @@ +The [ViTImageProcessor/DeiTImageProcessor] class is responsible for preprocessing the input image and +[RobertaTokenizer/XLMRobertaTokenizer] decodes the generated target tokens to the target string. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_33.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..2660ab19612ee22358b289aa29a27ba8530ff66f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_33.txt @@ -0,0 +1,3 @@ +The +[TrOCRProcessor] wraps [ViTImageProcessor/DeiTImageProcessor] and [RobertaTokenizer/XLMRobertaTokenizer] +into a single instance to both extract the input features and decode the predicted token ids.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_34.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..12f29e30c0660fe579678ef84e4f9b76a08e4cb2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_34.txt @@ -0,0 +1,17 @@ +Step-by-step Optical Character Recognition (OCR) + +``` py + +from transformers import TrOCRProcessor, VisionEncoderDecoderModel +import requests +from PIL import Image +processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten") +model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten") +# load image from the IAM dataset +url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg" +image = Image.open(requests.get(url, stream=True).raw).convert("RGB") +pixel_values = processor(image, return_tensors="pt").pixel_values +generated_ids = model.generate(pixel_values) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] + +See the model hub to look for TrOCR checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_35.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..57fe41f7e23ee544dc64c9a60524120057a32396 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_35.txt @@ -0,0 +1,12 @@ +TrOCRConfig +[[autodoc]] TrOCRConfig +TrOCRProcessor +[[autodoc]] TrOCRProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode +TrOCRForCausalLM +[[autodoc]] TrOCRForCausalLM + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_7.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e830375ebeccba94c514b855411f210d2da88166 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_7.txt @@ -0,0 +1,3 @@ +Experiments +show that the TrOCR model outperforms the current state-of-the-art models on both printed and handwritten text recognition +tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_8.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d95163104fa23e267e5c96d2ae7156aa7c2397b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_8.txt @@ -0,0 +1 @@ +TrOCR architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_trocr/chunk_9.txt b/chunked/content_aware_chunking/model_doc_trocr/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_trocr/chunk_9.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0855031ccafb3810fb19378f278d2607c994037 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by Zineng Tang.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcea4bdec4531a6d9c0a33293b061bb5848542cd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +TVLT is a model that takes both pixel_values and audio_values as input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac57ca158264d48f164387667d8aa4b2ddc2fde5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_12.txt @@ -0,0 +1 @@ +One can use [TvltProcessor] to prepare data for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5693ad3ee9426f5f501e04653c14dcc5d9e04760 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_13.txt @@ -0,0 +1 @@ +This processor wraps an image processor (for the image/video modality) and an audio feature extractor (for the audio modality) into one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9442e678b6f2949f087ef15d193791f38dca253 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_14.txt @@ -0,0 +1 @@ +TVLT is trained with images/videos and audios of various sizes: the authors resize and crop the input images/videos to 224 and limit the length of audio spectrogram to 2048. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..444614a69e4a441b9af12d12eeff3499ec284291 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_15.txt @@ -0,0 +1 @@ +To make batching of videos and audios possible, the authors use a pixel_mask that indicates which pixels are real/padding and audio_mask that indicates which audio values are real/padding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ef87683811189659bb1621eb9df32333112131c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_16.txt @@ -0,0 +1 @@ +The design of TVLT is very similar to that of a standard Vision Transformer (ViT) and masked autoencoder (MAE) as in ViTMAE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..272fe2d04dfd543b9a0cae7d89fc8eebce5c767c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_17.txt @@ -0,0 +1 @@ +The difference is that the model includes embedding layers for the audio modality. 
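A minimal sketch of this joint video/audio preprocessing, assuming the ZinengTang/tvlt-base checkpoint and random arrays in place of real frames and a real waveform:

```py
# Hedged sketch: TvltProcessor turns video frames and an audio waveform into the
# pixel/audio inputs (and padding masks) that TvltModel consumes together.
import numpy as np
import torch
from transformers import TvltModel, TvltProcessor

processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")  # assumed checkpoint
model = TvltModel.from_pretrained("ZinengTang/tvlt-base")

video = list(np.random.randn(8, 3, 224, 224))   # 8 stand-in frames, channels-first
audio = list(np.random.randn(10000))            # stand-in mono waveform

inputs = processor(video, audio, sampling_rate=44100, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(outputs.last_hidden_state.shape)
```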
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..66304687e043ed5506837682fa9b133716659337 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_18.txt @@ -0,0 +1 @@ +The PyTorch version of this model is only available in torch 1.10 and higher. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e80352c417e0185cdc0c7a9ec416d3e601d04184 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_19.txt @@ -0,0 +1,20 @@ +TvltConfig +[[autodoc]] TvltConfig +TvltProcessor +[[autodoc]] TvltProcessor + - call +TvltImageProcessor +[[autodoc]] TvltImageProcessor + - preprocess +TvltFeatureExtractor +[[autodoc]] TvltFeatureExtractor + - call +TvltModel +[[autodoc]] TvltModel + - forward +TvltForPreTraining +[[autodoc]] TvltForPreTraining + - forward +TvltForAudioVisualClassification +[[autodoc]] TvltForAudioVisualClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_7.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fdc9ad781a1d7de5987c8906eaf2ae8f0620b62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_7.txt @@ -0,0 +1 @@ +TVLT architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..62969105c469897bc040ff2b70b6b47004d4534b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper (https://arxiv.org/abs/2102.03334). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvlt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_tvlt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvlt/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_10.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05734f1dbc3f8ec1dacadd1803d1e84866620ff8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_10.txt @@ -0,0 +1 @@ +TVP involves integrating specially designed patterns, known as 'prompts', into both the visual (image-based) and textual (word-based) input components of a TVG model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_11.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c60bafc801b842f57a5c82b2027eac69fd3331ac --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_11.txt @@ -0,0 +1 @@ +These prompts provide additional spatial-temporal context, improving the model's ability to accurately determine event timings in the video.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_12.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6045f350efc7db7774a8055af38f3379eeba3888 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_12.txt @@ -0,0 +1 @@ +The approach employs 2D visual inputs in place of 3D ones. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_13.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2906c08f2e026697c44b7bffc59ab9bf861c6284 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_13.txt @@ -0,0 +1 @@ +Although 3D inputs offer more spatial-temporal detail, they are also more time-consuming to process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_14.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc282e9859b0b334e26194e6b596ade8638e493 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_14.txt @@ -0,0 +1 @@ +The use of 2D inputs with the prompting method aims to provide similar levels of context and accuracy more efficiently. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_15.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..410191f4575cb8cf9d65f0d905395ce582d81b89 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_15.txt @@ -0,0 +1 @@ +TVP architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_16.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_16.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_17.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..40b18c43ca8ed0bd4d7e596b464183a583c03dcf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_17.txt @@ -0,0 +1 @@ +This model was contributed by Jiqing Feng. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_18.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_18.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_19.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..7616cfe39ffd4e44f03de2e6a54a78e0476c0fae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_19.txt @@ -0,0 +1,2 @@ +Usage tips and examples +Prompts are optimized perturbation patterns, which would be added to input video frames or text features. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_20.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..63fd3a5e17abf73efb2dc2de7a1801cd4811a1f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_20.txt @@ -0,0 +1 @@ +Universal set refers to using the same exact set of prompts for any input, this means that these prompts are added consistently to all video frames and text features, regardless of the input's content. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_21.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0693679f4a9293d49044012921731d2a6f8b3e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_21.txt @@ -0,0 +1 @@ +TVP consists of a visual encoder and cross-modal encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_22.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..df5241475ed4da7006ecd1880979da5d978e53cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_22.txt @@ -0,0 +1 @@ +A universal set of visual prompts and text prompts to be integrated into sampled video frames and textual features, respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_23.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2be40e284f4b63b4bf3bc50b1dbce4267a65cbc1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_23.txt @@ -0,0 +1 @@ +Specially, a set of different visual prompts are applied to uniformly-sampled frames of one untrimmed video in order. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_24.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..51c22f4259bdcd23364ba0e4ed822fa3498c1110 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_24.txt @@ -0,0 +1 @@ +The goal of this model is to incorporate trainable prompts into both visual inputs and textual features to temporal video grounding(TVG) problems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_25.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8c6818abc362ef878020781fd84e1475cff2519 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_25.txt @@ -0,0 +1 @@ +In principle, one can apply any visual, cross-modal encoder in the proposed architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_26.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a5e013e9a6da05c73c966ba51ea9359b2e94acd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_26.txt @@ -0,0 +1,2 @@ +The [TvpProcessor] wraps [BertTokenizer] and [TvpImageProcessor] into a single instance to both +encode the text and prepare the images respectively. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_27.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..263b4fc8efcd6974c17485c3c927d62479a8d1f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_27.txt @@ -0,0 +1 @@ +The following example shows how to run temporal video grounding using [TvpProcessor] and [TvpForVideoGrounding]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_28.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9068a988bb9d32b33db69e2ac5cf6f1cc2cec878 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_28.txt @@ -0,0 +1,10 @@ +thon +import av +import cv2 +import numpy as np +import torch +from huggingface_hub import hf_hub_download +from transformers import AutoProcessor, TvpForVideoGrounding +def pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps): + ''' + Convert the video from its original fps to the target_fps and decode the video with PyAV decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_29.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8be2f51ee04d27dba3176410aec2082d3fc63ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_29.txt @@ -0,0 +1,2 @@ +Args: + container (container): pyav container. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_30.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fdb1964cf5c5506663cfc3ed6452c6daf468166 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_30.txt @@ -0,0 +1 @@ +sampling_rate (int): frame sampling rate (interval between two sampled frames). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_31.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..4974f8a4fb9f46cf1642c906846824828aeb31df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_31.txt @@ -0,0 +1 @@ +num_frames (int): number of frames to sample. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_32.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e06f87a33f02986d2846cac5f70dbdbeab08f43 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_32.txt @@ -0,0 +1 @@ +clip_idx (int): if clip_idx is -1, perform random temporal sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_33.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1e239dbc410d5af26d641cfbdf92ae0a5daa99f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_33.txt @@ -0,0 +1,2 @@ +If clip_idx is larger than -1, uniformly split the video to num_clips + clips, and select the clip_idx-th video clip. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_34.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2ab21b5925130fd0fecd93a08b1c44e4c9e240e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_34.txt @@ -0,0 +1 @@ +num_clips (int): overall number of clips to uniformly sample from the given video. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_35.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..24252aec52842a4e233d932decd1d3a2b2ad8f67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_35.txt @@ -0,0 +1,2 @@ +target_fps (int): the input video may have different fps, convert it to + the target video fps before frame sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_36.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..08f975dd8ffeadb4312b023ecd05d7a09cf47da9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_36.txt @@ -0,0 +1,2 @@ +Returns: + frames (tensor): decoded frames from the video. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_37.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bbe2cdd5224f0c4453b70c12022f8babc15f1e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_37.txt @@ -0,0 +1,2 @@ +Return None if the no + video stream was found. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_38.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee97896a3b1d9fcea5712495bd98e4f09f428d6b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_38.txt @@ -0,0 +1,2 @@ +fps (float): the number of frames per second of the video. + ''' \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_39.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb450172639868b60cdf162e969ee1eba3b1b960 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_39.txt @@ -0,0 +1,23 @@ +video = container.streams.video[0] + fps = float(video.average_rate) + clip_size = sampling_rate * num_frames / target_fps * fps + delta = max(num_frames - clip_size, 0) + start_idx = delta * clip_idx / num_clips + end_idx = start_idx + clip_size - 1 + timebase = video.duration / num_frames + video_start_pts = int(start_idx * timebase) + video_end_pts = int(end_idx * timebase) + seek_offset = max(video_start_pts - 1024, 0) + container.seek(seek_offset, any_frame=False, backward=True, stream=video) + frames = {} + for frame in container.decode(video=0): + if frame.pts < video_start_pts: + continue + frames[frame.pts] = frame + if frame.pts > video_end_pts: + break + frames = [frames[pts] for pts in sorted(frames)] + return frames, fps +def decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps): + ''' + Decode the video and perform temporal sampling. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_40.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8be2f51ee04d27dba3176410aec2082d3fc63ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_40.txt @@ -0,0 +1,2 @@ +Args: + container (container): pyav container. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_41.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fdb1964cf5c5506663cfc3ed6452c6daf468166 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_41.txt @@ -0,0 +1 @@ +sampling_rate (int): frame sampling rate (interval between two sampled frames). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_42.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..4974f8a4fb9f46cf1642c906846824828aeb31df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_42.txt @@ -0,0 +1 @@ +num_frames (int): number of frames to sample. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_43.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e06f87a33f02986d2846cac5f70dbdbeab08f43 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_43.txt @@ -0,0 +1 @@ +clip_idx (int): if clip_idx is -1, perform random temporal sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_44.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1e239dbc410d5af26d641cfbdf92ae0a5daa99f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_44.txt @@ -0,0 +1,2 @@ +If clip_idx is larger than -1, uniformly split the video to num_clips + clips, and select the clip_idx-th video clip. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_45.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2ab21b5925130fd0fecd93a08b1c44e4c9e240e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_45.txt @@ -0,0 +1 @@ +num_clips (int): overall number of clips to uniformly sample from the given video. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_46.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..24252aec52842a4e233d932decd1d3a2b2ad8f67 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_46.txt @@ -0,0 +1,2 @@ +target_fps (int): the input video may have different fps, convert it to + the target video fps before frame sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_47.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..47d2694f7b9ff31626859d42d74439f1ff9abbb4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_47.txt @@ -0,0 +1,3 @@ +Returns: + frames (tensor): decoded frames from the video. 
+ ''' \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_48.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cbd7dda62dc6ad33ed9ad88952da63388b089f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_48.txt @@ -0,0 +1,20 @@ +assert clip_idx >= -2, "Not a valid clip_idx {}".format(clip_idx) + frames, fps = pyav_decode(container, sampling_rate, num_frames, clip_idx, num_clips, target_fps) + clip_size = sampling_rate * num_frames / target_fps * fps + index = np.linspace(0, clip_size - 1, num_frames) + index = np.clip(index, 0, len(frames) - 1).astype(np.int64) + frames = np.array([frames[idx].to_rgb().to_ndarray() for idx in index]) + frames = frames.transpose(0, 3, 1, 2) + return frames +file = hf_hub_download(repo_id="Intel/tvp_demo", filename="AK2KG.mp4", repo_type="dataset") +model = TvpForVideoGrounding.from_pretrained("Intel/tvp-base") +decoder_kwargs = dict( + container=av.open(file, metadata_errors="ignore"), + sampling_rate=1, + num_frames=model.config.num_frames, + clip_idx=0, + num_clips=1, + target_fps=3, +) +raw_sampled_frms = decode(**decoder_kwargs) +text = "a person is sitting on a bed." \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_49.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..8362b3c678c6dea0ea27eb27565b179c9095f46e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_49.txt @@ -0,0 +1,21 @@ +processor = AutoProcessor.from_pretrained("Intel/tvp-base") +model_inputs = processor( + text=[text], videos=list(raw_sampled_frms), return_tensors="pt", max_text_length=100 +) +model_inputs["pixel_values"] = model_inputs["pixel_values"].to(model.dtype) +output = model(**model_inputs) +def get_video_duration(filename): + cap = cv2.VideoCapture(filename) + if cap.isOpened(): + rate = cap.get(cv2.CAP_PROP_FPS) + frame_num = cap.get(cv2.CAP_PROP_FRAME_COUNT) + duration = frame_num / rate + return duration + return -1 +duration = get_video_duration(file) +start, end = processor.post_process_video_grounding(output.logits, duration) +print(f"The time slot of the video corresponding to the text \"{text}\" is from {start}s to {end}s") + +Tips: + +This implementation of TVP uses [BertTokenizer] to generate text embeddings and a ResNet-50 model to compute visual embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_50.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..200dcf8571741d627f0857ae37e0b5c8df702394 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_50.txt @@ -0,0 +1 @@ +Checkpoints for the pre-trained tvp-base model are released. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_51.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c7e46ab5c86813dafd5e671fe79fbb851f9e3f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_51.txt @@ -0,0 +1 @@ +Please refer to Table 2 for TVP's performance on the temporal video grounding task. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_tvp/chunk_52.txt b/chunked/content_aware_chunking/model_doc_tvp/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbf15b9ebb93fb25d45c4cdd9d3df5d1982969e6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_tvp/chunk_52.txt @@ -0,0 +1,14 @@ +TvpConfig +[[autodoc]] TvpConfig +TvpImageProcessor +[[autodoc]] TvpImageProcessor + - preprocess +TvpProcessor +[[autodoc]] TvpProcessor + - call +TvpModel +[[autodoc]] TvpModel + - forward +TvpForVideoGrounding +[[autodoc]] TvpForVideoGrounding + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e16cf7bae161761a5061703757b9cdd1df41640f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_10.txt @@ -0,0 +1 @@ +Our model also achieve strong results at in-context learning, outperforming 175B GPT-3 on zero-shot SuperGLUE and tripling the performance of T5-XXL on one-shot summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d360f90442d62393d1485d8bd00693c41ae52730 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by DanielHesslow. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8f2c6461ab9f6414fc1c91c675df42290293de0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +UL2 is an encoder-decoder model pre-trained on a mixture of denoising functions as well as fine-tuned on an array of downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7537b750528f879f7827555315289f1b98bad02 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_14.txt @@ -0,0 +1 @@ +UL2 has the same architecture as T5v1.1 but uses the Gated-SiLU activation function instead of Gated-GELU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1857fdb6d0273aee20644914d6cfced686c7d5d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_15.txt @@ -0,0 +1,4 @@ +The authors release checkpoints of one architecture which can be seen here + + +As UL2 has the same architecture as T5v1.1, refer to T5's documentation page for API reference, tips, code examples and notebooks. 
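Because UL2 shares the T5v1.1 architecture, it loads through the same seq2seq classes as T5. The snippet below is only a minimal loading/generation sketch: the "google/ul2" checkpoint name and the example prompt are assumptions not taken from the chunks above, and the released model is roughly 20B parameters, so real use needs substantial memory.

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Minimal sketch, assuming the public "google/ul2" checkpoint on the Hub.
# The checkpoint is ~20B parameters; in practice use device_map/offloading or a smaller derivative.
tokenizer = AutoTokenizer.from_pretrained("google/ul2")
model = AutoModelForSeq2SeqLM.from_pretrained("google/ul2", torch_dtype=torch.bfloat16)

# Plain text-to-text generation; the mode-switching prefixes mentioned in the abstract
# are checkpoint-specific, so check the model card for the exact tokens.
inputs = tokenizer("Translate English to German: The house is wonderful.", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))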
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e59cab1fd86e752410bebb0a0e54a47bea38f88 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_6.txt @@ -0,0 +1 @@ +We then propose Mixture-of-Denoisers (MoD), a pre-training objective that combines diverse pre-training paradigms together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..9188182aaa4a30a7ad3ef41708fb43fb6a6672ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_7.txt @@ -0,0 +1 @@ +We furthermore introduce a notion of mode switching, wherein downstream fine-tuning is associated with specific pre-training schemes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7641dd9ccc53ce684466868ed016884589e1a3b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_8.txt @@ -0,0 +1 @@ +We conduct extensive ablative experiments to compare multiple pre-training objectives and find that our method pushes the Pareto-frontier by outperforming T5 and/or GPT-like models across multiple diverse setups. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_ul2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_ul2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0dd0e71442dc30ea5862ec1e1162bddcf495d55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_ul2/chunk_9.txt @@ -0,0 +1 @@ +Finally, by scaling our model up to 20B parameters, we achieve SOTA performance on 50 well-established supervised NLP tasks ranging from language generation (with automated and human evaluation), language understanding, text classification, question answering, commonsense reasoning, long text reasoning, structured knowledge grounding and information retrieval. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_10.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a29b256b44e551d930ed40eea82ed044d40724b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +UMT5 was only pre-trained on mC4 excluding any supervised training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_11.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccca8a3540563470a22f72ed41e4a10b2365dfe6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_11.txt @@ -0,0 +1 @@ +Therefore, this model has to be fine-tuned before it is usable on a downstream task, unlike the original T5 model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_12.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f20ce12b89cae1948c890a2277d8e6610e877d0e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_12.txt @@ -0,0 +1,2 @@ +Since umT5 was pre-trained in an unsupervised manner, there's no real advantage to using a task prefix during single-task +fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_13.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02171176dfea8686a381c899b340100c72beb4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_13.txt @@ -0,0 +1 @@ +If you are doing multi-task fine-tuning, you should use a prefix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_14.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ef6b188b225951978f4e3898881bdfec9a6761 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_14.txt @@ -0,0 +1 @@ +Differences with mT5? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_15.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f058e5d6c6ce31a147fad3f12c336a237eb2711 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_15.txt @@ -0,0 +1 @@ +UmT5 is based on mT5, with a non-shared relative positional bias that is computed for each layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_16.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..57d604c26cf3b3f985b674a117459ea5bc73da93 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_16.txt @@ -0,0 +1 @@ +This means that the model sets has_relative_bias for each layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_17.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6059ba7b8fec0fbb6222c484bf720e087a81caa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_17.txt @@ -0,0 +1 @@ +The conversion script is also different because the model was saved in t5x's latest checkpointing format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_18.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..75b6fc38f36ccd8a1db6d37f669f6e97aee59315 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_18.txt @@ -0,0 +1,8 @@ +Sample usage +thon + +from transformers import AutoModelForSeq2SeqLM, AutoTokenizer +model = AutoModelForSeq2SeqLM.from_pretrained("google/umt5-small") +tokenizer = AutoTokenizer.from_pretrained("google/umt5-small") +inputs = tokenizer( + "A <extra_id_0> walks into a bar and orders a <extra_id_1> with <extra_id_2> pinch of <extra_id_3>. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_19.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c5b1d3545aa7a64072639523f1ab0d34b77e99f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_19.txt @@ -0,0 +1,6 @@ +", + return_tensors="pt", + ) +outputs = model.generate(**inputs) +print(tokenizer.batch_decode(outputs)) +['nyone who drink a alcohol A A. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_20.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a84106f964873a2eef0eb5ba25252e4bf92e5e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_20.txt @@ -0,0 +1,4 @@ +This I'] + + +Refer to T5's documentation page for more tips, code examples and notebooks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_21.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..175845b0f60e7b2b3a508b729ce68ef371db4ee4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_21.txt @@ -0,0 +1,20 @@ +UMT5Config +[[autodoc]] UMT5Config +UMT5Model +[[autodoc]] UMT5Model + - forward +UMT5ForConditionalGeneration +[[autodoc]] UMT5ForConditionalGeneration + - forward +UMT5EncoderModel +[[autodoc]] UMT5EncoderModel + - forward +UMT5ForSequenceClassification +[[autodoc]] UMT5ForSequenceClassification + - forward +UMT5ForTokenClassification +[[autodoc]] UMT5ForTokenClassification + - forward +UMT5ForQuestionAnswering +[[autodoc]] UMT5ForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_8.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0a055466462652337704ebe7eeb7a3193dfc779 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by agemagician and stefan-it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_umt5/chunk_9.txt b/chunked/content_aware_chunking/model_doc_umt5/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf67fffa4f5d9001fd37fd419894f4c6591262b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_umt5/chunk_9.txt @@ -0,0 +1,2 @@ +The original code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_10.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b4a00afdd6bb3f5bb7b0278d304cf9cc64f46f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_10.txt @@ -0,0 +1,2 @@ +Finally, we scale up training +dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_11.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_11.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_12.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ad58ddb8a8cff91286d83f324a8951d220fafcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_12.txt @@ -0,0 +1,2 @@ +The Authors' code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_13.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fca00b04ae313c8d09f6f914506f329a1c2a218 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_13.txt @@ -0,0 +1,3 @@ +Usage tips + +UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_14.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a90c5c4d359746e768db1d6bddc08460fc0811b7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_14.txt @@ -0,0 +1 @@ +Please use [Wav2Vec2Processor] for the feature extraction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_15.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..16b73555e60957b7a6b54591e987acdba6ccd7b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_15.txt @@ -0,0 +1,2 @@ +UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_16.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3fe7380055a51ba3c87e4cd6fd995d08bebfc4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_16.txt @@ -0,0 +1 @@ +UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks. 
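A minimal sketch of the CTC tip above: the raw waveform goes through [Wav2Vec2Processor] and the logits are greedily decoded with the wrapped [Wav2Vec2CTCTokenizer]. The "microsoft/unispeech-sat-base-100h-libri-ft" checkpoint name is an assumption (a CTC fine-tuned variant); any UniSpeechSat CTC checkpoint follows the same pattern.

import torch
from datasets import load_dataset
from transformers import Wav2Vec2Processor, UniSpeechSatForCTC

# Assumed CTC fine-tuned checkpoint; substitute your own if needed.
checkpoint = "microsoft/unispeech-sat-base-100h-libri-ft"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)
model = UniSpeechSatForCTC.from_pretrained(checkpoint)

ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
# The model takes the raw waveform as a float array; the processor turns it into input_values.
inputs = processor(ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# CTC decoding: greedy argmax over the vocabulary, then collapse repeats/blanks with the tokenizer.
predicted_ids = torch.argmax(logits, dim=-1)
print(processor.batch_decode(predicted_ids))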
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_17.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..076493690923c99f76babb5f8b9f99afa8d888a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_17.txt @@ -0,0 +1,27 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +UniSpeechSatConfig +[[autodoc]] UniSpeechSatConfig +UniSpeechSat specific outputs +[[autodoc]] models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput +UniSpeechSatModel +[[autodoc]] UniSpeechSatModel + - forward +UniSpeechSatForCTC +[[autodoc]] UniSpeechSatForCTC + - forward +UniSpeechSatForSequenceClassification +[[autodoc]] UniSpeechSatForSequenceClassification + - forward +UniSpeechSatForAudioFrameClassification +[[autodoc]] UniSpeechSatForAudioFrameClassification + - forward +UniSpeechSatForXVector +[[autodoc]] UniSpeechSatForXVector + - forward +UniSpeechSatForPreTraining +[[autodoc]] UniSpeechSatForPreTraining + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_5.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..32896b505fb38f6d696ce332f6c7599178295f95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_5.txt @@ -0,0 +1,2 @@ +First, we apply the multi-task learning to +the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_6.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..dee2391304182b6e3041514ae912454fe0772cb9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_6.txt @@ -0,0 +1,2 @@ +Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where +additional overlapped utterances are created unsupervisedly and incorporate during training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_7.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..af09565829705535ae837d00bf69c819ae75c1c9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_7.txt @@ -0,0 +1,2 @@ +We integrate the proposed +methods into the HuBERT framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_8.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d987b39963aaae15514c9f53d53515448f6b060 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_8.txt @@ -0,0 +1,3 @@ +Experiment results on SUPERB benchmark show that the proposed system achieves +state-of-the-art performance in universal representation learning, especially for speaker identification oriented +tasks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_9.txt b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e3261a8a0ae80ff82bd087dd34d7375ad6b1388 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech-sat/chunk_9.txt @@ -0,0 +1 @@ +An ablation study is performed verifying the efficacy of each proposed method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_10.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b51df876584ce690c9932f6e3ae8fb859bf373 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_10.txt @@ -0,0 +1,2 @@ +UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be + decoded using [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_11.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7142c7508696f3404c35af0012a4d54f14969e9b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_11.txt @@ -0,0 +1,21 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +UniSpeechConfig +[[autodoc]] UniSpeechConfig +UniSpeech specific outputs +[[autodoc]] models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput +UniSpeechModel +[[autodoc]] UniSpeechModel + - forward +UniSpeechForCTC +[[autodoc]] UniSpeechForCTC + - forward +UniSpeechForSequenceClassification +[[autodoc]] UniSpeechForSequenceClassification + - forward +UniSpeechForPreTraining +[[autodoc]] UniSpeechForPreTraining + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_6.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_6.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_7.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ad58ddb8a8cff91286d83f324a8951d220fafcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_7.txt @@ -0,0 +1,2 @@ +The Authors' code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_8.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..975766c9b6f5316737b677e7cfccf31f24867a6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_unispeech/chunk_9.txt b/chunked/content_aware_chunking/model_doc_unispeech/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..13c4bf4e938d3b18100f14bfa56b4c9a74639d33 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_unispeech/chunk_9.txt @@ -0,0 +1,2 @@ +Please + use [Wav2Vec2Processor] for the feature extraction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dffa8e4431e15d69123c6f317e24ac0a66602c0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_10.txt @@ -0,0 +1 @@ +Inspired by works in the field of voice activity detection, we added a multi-resolution spectrogram discriminator that employs multiple linear spectrogram magnitudes computed using various parameter sets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..df4665071d8d3219aad363bee24f2f32aef91380 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_11.txt @@ -0,0 +1 @@ +Using full-band mel-spectrograms as input, we expect to generate high-resolution signals by adding a discriminator that employs spectrograms of multiple resolutions as the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..405f6d4107d55602dba9f9a3bd971c51a01d3c46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_12.txt @@ -0,0 +1 @@ +In an evaluation on a dataset containing information on hundreds of speakers, UnivNet obtained the best objective and subjective results among competing models for both seen and unseen speakers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9ca7453b81c423ea172c3c74aea3a34f0fe2b87 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_13.txt @@ -0,0 +1 @@ +These results, including the best subjective score for text-to-speech, demonstrate the potential for fast adaptation to new speakers without a need for training from scratch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b02414d49a27d3af84fcf7ba6447e8eaa9d1bcc8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_14.txt @@ -0,0 +1,3 @@ +Tips: + +The noise_sequence argument for [UnivNetModel.forward] should be standard Gaussian noise (such as from torch.randn) of shape ([batch_size], noise_length, model.config.model_in_channels), where noise_length should match the length dimension (dimension 1) of the input_features argument. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b49eba1cd02bf2dd7897b0ccf9041b145b9c9583 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_15.txt @@ -0,0 +1 @@ +If not supplied, it will be randomly generated; a torch.Generator can be supplied to the generator argument so that the forward pass can be reproduced. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..bec73087996a36894b5420109c9419b86e51677f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_16.txt @@ -0,0 +1 @@ +(Note that [UnivNetFeatureExtractor] will return generated noise by default, so it shouldn't be necessary to generate noise_sequence manually.) \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bb8e6405011a1602bb878d21e523fc10501c90e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_17.txt @@ -0,0 +1 @@ +Padding added by [UnivNetFeatureExtractor] can be removed from the [UnivNetModel] output through the [UnivNetFeatureExtractor.batch_decode] method, as shown in the usage example below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_18.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a953f5864ba0d970bcaf44f85b1f0aa1c31ecf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_18.txt @@ -0,0 +1 @@ +Padding the end of each waveform with silence can reduce artifacts at the end of the generated audio sample. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_19.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1815db6091428dccb14fd589de78ff053f4d5d05 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_19.txt @@ -0,0 +1 @@ +This can be done by supplying pad_end = True to [UnivNetFeatureExtractor.__call__]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_20.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..44c6b204ccfd01b7e64ac74a4bb51831b849bf84 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_20.txt @@ -0,0 +1 @@ +See this issue for more details. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_21.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..530bff51ecb84eccd3e724e0b567000baf3c1ada --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_21.txt @@ -0,0 +1,11 @@ +Usage Example: +thon +import torch +from scipy.io.wavfile import write +from datasets import Audio, load_dataset +from transformers import UnivNetFeatureExtractor, UnivNetModel +model_id_or_path = "dg845/univnet-dev" +model = UnivNetModel.from_pretrained(model_id_or_path) +feature_extractor = UnivNetFeatureExtractor.from_pretrained(model_id_or_path) +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +Resample the audio to the model and feature extractor's sampling rate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_22.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..aef4a415400b0d0de1d1f808531abdfb8e909ef8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_22.txt @@ -0,0 +1,2 @@ +ds = ds.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate)) +Pad the end of the converted waveforms to reduce artifacts at the end of the output audio samples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_23.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..61362b5cd3b6120cd62cb4dbddde374cda5269ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_23.txt @@ -0,0 +1,6 @@ +inputs = feature_extractor( + ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], pad_end=True, return_tensors="pt" +) +with torch.no_grad(): + audio = model(**inputs) +Remove the extra padding at the end of the output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_24.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..66c13d0b50f082150cf839a431bb8267e3829d10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_24.txt @@ -0,0 +1,5 @@ +audio = feature_extractor.batch_decode(**audio)[0] +Convert to wav file +write("sample_audio.wav", feature_extractor.sampling_rate, audio) + +This model was contributed by dg845. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_25.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a21aa4e4be779d538a272bfcd1f4a3666322956 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_25.txt @@ -0,0 +1 @@ +To the best of my knowledge, there is no official code release, but an unofficial implementation can be found at maum-ai/univnet with pretrained checkpoints here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_26.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b2291aa8275bef80f7060e93b487c26eaa8af85 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_26.txt @@ -0,0 +1,8 @@ +UnivNetConfig +[[autodoc]] UnivNetConfig +UnivNetFeatureExtractor +[[autodoc]] UnivNetFeatureExtractor + - call +UnivNetModel +[[autodoc]] UnivNetModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_univnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_univnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..a60c1e399676b589d88a2d2d20356ef1a46665ce --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_univnet/chunk_9.txt @@ -0,0 +1 @@ +To address this problem, we propose UnivNet, a neural vocoder that synthesizes high-fidelity waveforms in real time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd41591cbe6a3c7ad2a8e7be9cd334587723afff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_10.txt @@ -0,0 +1 @@ +The original code is based on OpenMMLab's mmsegmentation here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c3fe9d1a040a88c95872bcadefc522015cc122e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_11.txt @@ -0,0 +1,2 @@ +Usage examples +UPerNet is a general framework for semantic segmentation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c55c320a40187310938be4e426004494156cac0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_12.txt @@ -0,0 +1,15 @@ +It can be used with any vision backbone, like so: + +from transformers import SwinConfig, UperNetConfig, UperNetForSemanticSegmentation +backbone_config = SwinConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) + +To use another vision backbone, like ConvNeXt, simply instantiate the model with the appropriate backbone: + +from transformers import ConvNextConfig, UperNetConfig, UperNetForSemanticSegmentation +backbone_config = ConvNextConfig(out_features=["stage1", "stage2", "stage3", "stage4"]) +config = UperNetConfig(backbone_config=backbone_config) +model = UperNetForSemanticSegmentation(config) + +Note that this will randomly initialize all the weights of the model. 
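The snippets above build a randomly initialized model; for inference one would usually start from a released checkpoint instead. The sketch below assumes "openmmlab/upernet-convnext-tiny" (an ADE20k-trained UPerNet with a ConvNeXt backbone) and that its image processor resolves through AutoImageProcessor; the image URL is only an example.

import torch
import requests
from PIL import Image
from transformers import AutoImageProcessor, UperNetForSemanticSegmentation

# Assumed pretrained checkpoint on the Hub.
checkpoint = "openmmlab/upernet-convnext-tiny"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = UperNetForSemanticSegmentation.from_pretrained(checkpoint)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Upsample the logits back to the original image size and take the per-pixel argmax.
segmentation = image_processor.post_process_semantic_segmentation(outputs, target_sizes=[image.size[::-1]])[0]
print(segmentation.shape)  # (height, width) map of class indices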
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc7a06d39e255ea01c52e51f98855f68bbf20cb3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_13.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UPerNet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..5782955793076a3e900498061c1bee4d434e22fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_14.txt @@ -0,0 +1 @@ +Demo notebooks for UPerNet can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b221ce2e5482f44780d8a7c498cdab1d9f30550 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_15.txt @@ -0,0 +1 @@ +[UperNetForSemanticSegmentation] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..867499d48a6e7b7985f656b58a0bdb0d14fc1320 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_16.txt @@ -0,0 +1,3 @@ +See also: Semantic segmentation task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_17.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_18.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..893ee8067b7614891aa347087f28139404231014 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_18.txt @@ -0,0 +1,5 @@ +UperNetConfig +[[autodoc]] UperNetConfig +UperNetForSemanticSegmentation +[[autodoc]] UperNetForSemanticSegmentation + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_5.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a5a67a025850ee2fbbee56c3b10dd32ebd5df16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_5.txt @@ -0,0 +1 @@ +We benchmark our framework on Unified Perceptual Parsing and show that it is able to effectively segment a wide range of concepts from images. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..23efc3679e3d35a889c42186f94456b297b30e4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_6.txt @@ -0,0 +1 @@ +The trained networks are further applied to discover visual knowledge in natural scenes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2626cb2b862ce203d7c1cf532d9219869a937056 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_7.txt @@ -0,0 +1 @@ +UPerNet framework. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_upernet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_upernet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_upernet/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_10.txt b/chunked/content_aware_chunking/model_doc_van/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..31fc7f33fae502068b3c21953d398dffbb25c135 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_10.txt @@ -0,0 +1 @@ +(2) The quadratic complexity is too expensive for high-resolution images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_11.txt b/chunked/content_aware_chunking/model_doc_van/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1c91145a2f997a0ea7606a2e596372924ac189f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_11.txt @@ -0,0 +1 @@ +(3) It only captures spatial adaptability but ignores channel adaptability. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_12.txt b/chunked/content_aware_chunking/model_doc_van/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c691a3a74cf33bc39aee30b31a0c8b0d7a0c7550 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_12.txt @@ -0,0 +1 @@ +In this paper, we propose a novel large kernel attention (LKA) module to enable self-adaptive and long-range correlations in self-attention while avoiding the above issues. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_13.txt b/chunked/content_aware_chunking/model_doc_van/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..432fc96f0c76ab39c7ace22fdca94b778e82fcf2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_13.txt @@ -0,0 +1 @@ +We further introduce a novel neural network based on LKA, namely Visual Attention Network (VAN). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_14.txt b/chunked/content_aware_chunking/model_doc_van/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ab71fe518a9bc2793ba476e727b946b3789bed9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_14.txt @@ -0,0 +1 @@ +While extremely simple, VAN outperforms the state-of-the-art vision transformers and convolutional neural networks with a large margin in extensive experiments, including image classification, object detection, semantic segmentation, instance segmentation, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_15.txt b/chunked/content_aware_chunking/model_doc_van/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..97763b6e911107c5d35dc0ae3d808b18a9976bc0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_15.txt @@ -0,0 +1 @@ +Code is available at this https URL. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_16.txt b/chunked/content_aware_chunking/model_doc_van/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc27f424d8034fce414ae8fa5ea4c89bb2c360b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_16.txt @@ -0,0 +1,3 @@ +Tips: + +VAN does not have an embedding layer, thus the hidden_states will have a length equal to the number of stages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_17.txt b/chunked/content_aware_chunking/model_doc_van/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f202cc576f682ee661ac6d5560bb227cd05c1a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_17.txt @@ -0,0 +1 @@ +The figure below illustrates the architecture of a Visual Attention Layer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_18.txt b/chunked/content_aware_chunking/model_doc_van/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_18.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_19.txt b/chunked/content_aware_chunking/model_doc_van/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..593cf6dd0ebb63de7187a7a416ba0ce3c13e685e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_19.txt @@ -0,0 +1 @@ +This model was contributed by Francesco. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_20.txt b/chunked/content_aware_chunking/model_doc_van/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_20.txt @@ -0,0 +1 @@ +The original code can be found here. 
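A small sketch of the hidden_states tip above: because VAN has no embedding layer, requesting hidden states returns one tensor per stage. It uses a randomly initialized VanConfig/VanModel so no checkpoint download is needed; reading the stage count from config.hidden_sizes (one entry per stage) is an assumption about the config layout.

import torch
from transformers import VanConfig, VanModel

# Randomly initialized model; enough to inspect output shapes.
config = VanConfig()
model = VanModel(config)
model.eval()

pixel_values = torch.randn(1, 3, 224, 224)
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

# No embedding layer, so there is exactly one hidden state per stage.
print(len(outputs.hidden_states), len(config.hidden_sizes))  # expected to match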
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_21.txt b/chunked/content_aware_chunking/model_doc_van/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4604d959d2f59c5b488ce4d3374633e012c08fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_21.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VAN. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_22.txt b/chunked/content_aware_chunking/model_doc_van/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a48c830e989a21573fd96ff5749bb9a9b6b55af8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_22.txt @@ -0,0 +1 @@ +[VanForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_23.txt b/chunked/content_aware_chunking/model_doc_van/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_23.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_24.txt b/chunked/content_aware_chunking/model_doc_van/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_24.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_25.txt b/chunked/content_aware_chunking/model_doc_van/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..31ec04bb4ffd8dc87cb55b143793e3cc95c49b57 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_25.txt @@ -0,0 +1,8 @@ +VanConfig +[[autodoc]] VanConfig +VanModel +[[autodoc]] VanModel + - forward +VanForImageClassification +[[autodoc]] VanForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_7.txt b/chunked/content_aware_chunking/model_doc_van/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc1f07f55d215b8c491fc21dda8eb93e1da95b28 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_7.txt @@ -0,0 +1,2 @@ +The abstract from the paper is the following: +While originally designed for natural language processing tasks, the self-attention mechanism has recently taken various computer vision areas by storm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_8.txt b/chunked/content_aware_chunking/model_doc_van/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e52e488dc1ae2930a23f2a9aef5049986539e9c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_8.txt @@ -0,0 +1 @@ +However, the 2D nature of images brings three challenges for applying self-attention in computer vision. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_van/chunk_9.txt b/chunked/content_aware_chunking/model_doc_van/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f38209ceca653233945c19ce79400b016e77fe15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_van/chunk_9.txt @@ -0,0 +1 @@ +(1) Treating images as 1D sequences neglects their 2D structures. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_10.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..22786609022589c0adc03f4499945edc56c06b3f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_10.txt @@ -0,0 +1 @@ +(3) VideoMAE shows that data quality is more important than data quantity for SSVP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_11.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..67ac4d8751dee20dcad2955232795d97cbaf3fba --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_11.txt @@ -0,0 +1 @@ +Domain shift between pre-training and target datasets is an important issue in SSVP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_12.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6a72d9faeb5bc5da9d8781c40fa13473a6392c1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_12.txt @@ -0,0 +1 @@ +Notably, our VideoMAE with the vanilla ViT backbone can achieve 83.9% on Kinetics-400, 75.3% on Something-Something V2, 90.8% on UCF101, and 61.1% on HMDB51 without using any extra data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_13.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b13215f1ce940f505776758321c554ebad268b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_13.txt @@ -0,0 +1 @@ +VideoMAE pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_14.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_14.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_15.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_15.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_16.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_16.txt @@ -0,0 +1 @@ +The original code can be found here.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_17.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..49726478683426cb2600d00690164d2cd392bfcf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_17.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with VideoMAE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_18.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f4d8f108e665d89d870ebbc688fb15750996439 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_18.txt @@ -0,0 +1,3 @@ +If +you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll +review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_19.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_19.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_20.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cc1b83e59b76933910d02e079ef64e966fc02c6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_20.txt @@ -0,0 +1,3 @@ +Video classification +- A notebook that shows how +to fine-tune a VideoMAE model on a custom dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_21.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c00684f4c0f06a03187d89de1a96f6fc9ae03996 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_21.txt @@ -0,0 +1,2 @@ +- Video classification task guide +- A 🤗 Space showing how to perform inference with a video classification model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_22.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fb48dbfcb2d81f58ad818d46b7698490d6022f7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_22.txt @@ -0,0 +1,13 @@ +VideoMAEConfig +[[autodoc]] VideoMAEConfig +VideoMAEFeatureExtractor +[[autodoc]] VideoMAEFeatureExtractor + - call +VideoMAEImageProcessor +[[autodoc]] VideoMAEImageProcessor + - preprocess +VideoMAEModel +[[autodoc]] VideoMAEModel + - forward +VideoMAEForPreTraining +VideoMAEForPreTraining includes the decoder on top for self-supervised pre-training. 
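As a complement to the VideoMAE class list above, the following is a minimal sketch of the self-supervised pre-training forward pass that VideoMAEForPreTraining (with its decoder on top) exposes. The checkpoint name `MCG-NJU/videomae-base`, the random clip, and the random mask are illustrative assumptions; real pre-training uses a much higher masking ratio over real video.

```python
import numpy as np
import torch
from transformers import AutoImageProcessor, VideoMAEForPreTraining

# Assumed checkpoint name for the pre-trained (not fine-tuned) model.
checkpoint = "MCG-NJU/videomae-base"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = VideoMAEForPreTraining.from_pretrained(checkpoint)

# 16 random frames stand in for a real clip.
num_frames = 16
video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))
pixel_values = image_processor(video, return_tensors="pt").pixel_values

# Build a boolean mask over the tube tokens.
num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
bool_masked_pos = torch.randint(0, 2, (1, seq_length)).bool()

outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
print(outputs.loss)  # reconstruction loss on the masked tubes
```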
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_23.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f2f9425b91fef398331e159d1551159712e692c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_23.txt @@ -0,0 +1,5 @@ +[[autodoc]] transformers.VideoMAEForPreTraining + - forward +VideoMAEForVideoClassification +[[autodoc]] transformers.VideoMAEForVideoClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_7.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa23751afbe39c3ada19d3960ac4f1e6c656fda --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_7.txt @@ -0,0 +1 @@ +The temporally redundant video content enables higher masking ratio than that of images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_8.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6690a04ecaf0bc2e76b69cfedf7a31b30ec76e7c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_8.txt @@ -0,0 +1 @@ +(2) VideoMAE achieves impressive results on very small datasets (i.e., around 3k-4k videos) without using any extra data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_videomae/chunk_9.txt b/chunked/content_aware_chunking/model_doc_videomae/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..5399f4a8ba84733ab9b1c42af732acd98bc46c63 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_videomae/chunk_9.txt @@ -0,0 +1 @@ +This is partially ascribed to the challenging task of video reconstruction to enforce high-level structure learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dd093cfc29362ba32b8d46c4215ab89263222a4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_11.txt @@ -0,0 +1,4 @@ +Usage tips + +The quickest way to get started with ViLT is by checking the example notebooks + (which showcase both inference and fine-tuning on custom data). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2c69e857cd0c015197c7503016406947a14d714 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_12.txt @@ -0,0 +1 @@ +ViLT is a model that takes both pixel_values and input_ids as input. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fe2095a7cc860462dd8f63c9fd8827bc67e8122 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_13.txt @@ -0,0 +1 @@ +One can use [ViltProcessor] to prepare data for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b16e5f042b20db2a5f7e8e4909d0352032aaa1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_14.txt @@ -0,0 +1 @@ +This processor wraps a image processor (for the image modality) and a tokenizer (for the language modality) into one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..32bd321b78a64da564b8c2d1691174aede57f542 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_15.txt @@ -0,0 +1,2 @@ +ViLT is trained with images of various sizes: the authors resize the shorter edge of input images to 384 and limit the longer edge to + under 640 while preserving the aspect ratio. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..22486d6e14dd25086a715a45b3c5c1bf96356551 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_16.txt @@ -0,0 +1,2 @@ +To make batching of images possible, the authors use a pixel_mask that indicates + which pixel values are real and which are padding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec63ec2eaeb52a8144352d143f0dc728aefd76e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_17.txt @@ -0,0 +1 @@ +[ViltProcessor] automatically creates this for you. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8986c73c68736e8f17af18431520b829ab688ec4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_18.txt @@ -0,0 +1 @@ +The design of ViLT is very similar to that of a standard Vision Transformer (ViT). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..45d66c0718cd4e812c02b9e4eda9bb7615464087 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_19.txt @@ -0,0 +1,2 @@ +The only difference is that the model includes + additional embedding layers for the language modality. 
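To illustrate the points above (a single ViltProcessor producing input_ids for the text plus pixel_values and pixel_mask for the image), here is a minimal visual question answering sketch. The `dandelin/vilt-b32-finetuned-vqa` checkpoint and the COCO image URL are used as illustrative examples only.

```python
import requests
import torch
from PIL import Image
from transformers import ViltProcessor, ViltForQuestionAnswering

# Example VQA checkpoint; substitute any fine-tuned ViLT checkpoint.
processor = ViltProcessor.from_pretrained("dandelin/vilt-b32-finetuned-vqa")
model = ViltForQuestionAnswering.from_pretrained("dandelin/vilt-b32-finetuned-vqa")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
text = "How many cats are there?"

# The processor tokenizes the question and prepares the image in one call,
# returning input_ids, pixel_values and the automatically created pixel_mask.
encoding = processor(image, text, return_tensors="pt")
print(sorted(encoding.keys()))

with torch.no_grad():
    outputs = model(**encoding)
print(model.config.id2label[outputs.logits.argmax(-1).item()])
```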
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..66304687e043ed5506837682fa9b133716659337 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_20.txt @@ -0,0 +1 @@ +The PyTorch version of this model is only available in torch 1.10 and higher. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..056f3388374433f6bd10c0446e2566d1338cc08f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_21.txt @@ -0,0 +1,29 @@ +ViltConfig +[[autodoc]] ViltConfig +ViltFeatureExtractor +[[autodoc]] ViltFeatureExtractor + - call +ViltImageProcessor +[[autodoc]] ViltImageProcessor + - preprocess +ViltProcessor +[[autodoc]] ViltProcessor + - call +ViltModel +[[autodoc]] ViltModel + - forward +ViltForMaskedLM +[[autodoc]] ViltForMaskedLM + - forward +ViltForQuestionAnswering +[[autodoc]] ViltForQuestionAnswering + - forward +ViltForImagesAndTextClassification +[[autodoc]] ViltForImagesAndTextClassification + - forward +ViltForImageAndTextRetrieval +[[autodoc]] ViltForImageAndTextRetrieval + - forward +ViltForTokenClassification +[[autodoc]] ViltForTokenClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vilt/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vilt/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vilt/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2d90de20e0486d0052162cfa107e3bc8351ad45 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_10.txt @@ -0,0 +1 @@ +We advise users to use padding_side="left" when computing batched generation as it leads to more accurate results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ef3d9de7ea075c290668be11d569e1b2469e319 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_11.txt @@ -0,0 +1 @@ +Simply make sure to call processor.tokenizer.padding_side = "left" before generating. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ce634499e3cf81b02d05e767a50200c8eb0537d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_12.txt @@ -0,0 +1 @@ +Note the model has not been explicitly trained to process multiple images in the same prompt, although this is technically possible, you may experience inaccurate results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dab3ed60cb34056e313b187408677722c1307db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_13.txt @@ -0,0 +1,3 @@ +For better results, we recommend users to prompt the model with the correct prompt format: + +A chat between a curious human and an artificial intelligence assistant. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ae3f8ab0c1bbf64f25218796a54105b38a5418c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_14.txt @@ -0,0 +1,4 @@ +The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: +For multiple turns conversation: + +A chat between a curious human and an artificial intelligence assistant. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c2a050b6554c6fb3d20b98f76edde195a6f1cb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_15.txt @@ -0,0 +1,2 @@ +The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: \n###Assistant: ###Human: ###Assistant: +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f87bbca4c6d2ebbde49d0dbdc9e18f5341dd0f36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_16.txt @@ -0,0 +1,6 @@ +This model was contributed by Younes Belkada +VipLlavaConfig +[[autodoc]] VipLlavaConfig +VipLlavaForConditionalGeneration +[[autodoc]] VipLlavaForConditionalGeneration + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2266e795f5a65e2a08ea31bc5c5fe293f7cd4f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_8.txt @@ -0,0 +1 @@ +Code, data, and model are publicly available. 
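Putting the two tips above together (left padding for generation and the expected chat template), here is a rough single-image generation sketch. The checkpoint name `llava-hf/vip-llava-7b-hf` and the placement of the `<image>` placeholder inside the template are assumptions and should be checked against the model card, since the template shown in the chunks elides the image token.

```python
import requests
from PIL import Image
from transformers import AutoProcessor, VipLlavaForConditionalGeneration

# Assumed checkpoint name; verify the exact identifier on the Hub.
model_id = "llava-hf/vip-llava-7b-hf"
processor = AutoProcessor.from_pretrained(model_id)
model = VipLlavaForConditionalGeneration.from_pretrained(model_id)

# Left padding is recommended for batched generation.
processor.tokenizer.padding_side = "left"

question = "What is shown in this image?"
# The "<image>" placeholder position is an assumption about where image features are spliced in.
prompt = (
    "A chat between a curious human and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the human's questions."
    f"###Human: <image>\n{question}###Assistant:"
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(text=prompt, images=image, return_tensors="pt")
generated_ids = model.generate(**inputs, max_new_tokens=50)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])
```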
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vipllava/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vipllava/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..72bae01bd9da2d9d1840d3e7a23216ccbef299d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vipllava/chunk_9.txt @@ -0,0 +1,3 @@ +Tips: + +The architecture is similar to the llava architecture except that the multi-modal projector takes a set of concatenated vision hidden states and has an additional layernorm layer on that module. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..23264d874f7f1cc905f716f5de4190eaa4e2add8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_10.txt @@ -0,0 +1,2 @@ +In the following example, we show how to do this using the default [ViTModel] configuration for the encoder +and the default [BertForCausalLM] configuration for the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3e91583bdfdd4ca78bbf76a468d57185645e7ec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_11.txt @@ -0,0 +1,9 @@ +thon + +from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel +config_encoder = ViTConfig() +config_decoder = BertConfig() +config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder) +model = VisionEncoderDecoderModel(config=config) + +Initialising VisionEncoderDecoderModel from a pretrained encoder and a pretrained decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d9fe446e7ed8166c62c68e19416c288d1cfd274 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_12.txt @@ -0,0 +1 @@ +[VisionEncoderDecoderModel] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..12fd3671a92b242572ce5507b7e3f2f76b45bbdd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_13.txt @@ -0,0 +1 @@ +Note that any pretrained Transformer-based vision model, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..e89b2eef2c607d74466196c2d1130eccfd66ba3f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_14.txt @@ -0,0 +1 @@ +Swin, can serve as the encoder and both pretrained auto-encoding models, e.g.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9884768ca8473d038554cd6f2053ad009b4870b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_15.txt @@ -0,0 +1 @@ +BERT, pretrained causal language models, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..26811a35577f522a9e38509e0f4fec14461f14d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_16.txt @@ -0,0 +1 @@ +GPT2, as well as the pretrained decoder part of sequence-to-sequence models, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8df167b6ff1390dd23d0cf76ff607af04532696c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_17.txt @@ -0,0 +1 @@ +decoder of BART, can be used as the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd7d22e2c0189856696b62696c816fec60e2b622 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_18.txt @@ -0,0 +1 @@ +Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c8a791e71248150690a1d05905ec31012bea64a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_19.txt @@ -0,0 +1 @@ +Initializing [VisionEncoderDecoderModel] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in the Warm-starting-encoder-decoder blog post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c09716020848e295fbe1f71ea4aa5e8e02e1761 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_20.txt @@ -0,0 +1 @@ +To do so, the VisionEncoderDecoderModel class provides a [VisionEncoderDecoderModel.from_encoder_decoder_pretrained] method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c87b369586a4dfff13ad0a73ebbcc9614afe44e1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_21.txt @@ -0,0 +1,8 @@ +thon + +from transformers import VisionEncoderDecoderModel +model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( + "microsoft/swin-base-patch4-window7-224-in22k", "google-bert/bert-base-uncased" + ) + +Loading an existing VisionEncoderDecoderModel checkpoint and perform inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_22.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..053090211e1bc7a58356cd7632ab7a1c657a0085 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_22.txt @@ -0,0 +1 @@ +To load fine-tuned checkpoints of the VisionEncoderDecoderModel class, [VisionEncoderDecoderModel] provides the from_pretrained() method just like any other model architecture in Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_23.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..e55dc314fb376c66035398fd8c7b4ace794cafd4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_23.txt @@ -0,0 +1 @@ +To perform inference, one uses the [generate] method, which allows to autoregressively generate text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_24.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..f29d5c272c8190c62073648395190c4babe6a276 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_24.txt @@ -0,0 +1 @@ +This method supports various forms of decoding, such as greedy, beam search and multinomial sampling. 
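Since the text above notes that generate supports greedy decoding, beam search and multinomial sampling, here is a small sketch of how those strategies are selected purely through generate keyword arguments, reusing the `nlpconnect/vit-gpt2-image-captioning` checkpoint from the captioning example that follows; the specific parameter values are illustrative, not prescribed by the docs.

```python
import requests
from PIL import Image
from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
pixel_values = image_processor(image, return_tensors="pt").pixel_values

# Greedy decoding is the default.
greedy_ids = model.generate(pixel_values)

# Beam search keeps the num_beams most likely hypotheses at each step.
beam_ids = model.generate(pixel_values, num_beams=4, early_stopping=True, max_length=32)

# Multinomial sampling draws from the predicted distribution instead of taking the argmax.
sample_ids = model.generate(pixel_values, do_sample=True, top_k=50, max_length=32)

print(tokenizer.batch_decode(beam_ids, skip_special_tokens=True)[0])
```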
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_25.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea70695e4cf70161e0645c8ebd6aa372aa5d1ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_25.txt @@ -0,0 +1,20 @@ +thon + +import requests +from PIL import Image +from transformers import GPT2TokenizerFast, ViTImageProcessor, VisionEncoderDecoderModel +load a fine-tuned image captioning model and corresponding tokenizer and image processor +model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +tokenizer = GPT2TokenizerFast.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +let's perform inference on an image +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +pixel_values = image_processor(image, return_tensors="pt").pixel_values +autoregressively generate caption (uses greedy decoding by default) +generated_ids = model.generate(pixel_values) +generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_text) +a cat laying on a blanket next to a cat laying on a bed + +Loading a PyTorch checkpoint into TFVisionEncoderDecoderModel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_26.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1c57d7b361d8aacc1b47563d9e21e7adb9d9e16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_26.txt @@ -0,0 +1,2 @@ +[TFVisionEncoderDecoderModel.from_pretrained] currently doesn't support initializing the model from a +PyTorch checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_27.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4276f4671a622acf63e86454add1a9998c15313 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_27.txt @@ -0,0 +1 @@ +Passing from_pt=True to this method will throw an exception. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_28.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd528a7b5942ef419cb8c2976b1f3ddd5fbdf157 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_28.txt @@ -0,0 +1,12 @@ +If there are only PyTorch +checkpoints for a particular vision encoder-decoder model, a workaround is: +thon + +from transformers import VisionEncoderDecoderModel, TFVisionEncoderDecoderModel +_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning") +_model.encoder.save_pretrained("./encoder") +_model.decoder.save_pretrained("./decoder") +model = TFVisionEncoderDecoderModel.from_encoder_decoder_pretrained( + "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True + ) +This is only for copying some specific attributes of this particular model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_29.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f733f525258fd86921e073dcd6123efaf3b8ab5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_29.txt @@ -0,0 +1,4 @@ +model.config = _model.config + +Training +Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model on a dataset of (image, text) pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_30.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..51c60189d0a51068614bfb0bbce10b9d02f22358 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_30.txt @@ -0,0 +1,2 @@ +As you can see, only 2 inputs are required for the model in order to compute a loss: pixel_values (which are the +images) and labels (which are the input_ids of the encoded target sequence). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_31.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab66fe34b3fc00996c404e1853da113f6e84976b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_31.txt @@ -0,0 +1,22 @@ +thon + +from transformers import ViTImageProcessor, BertTokenizer, VisionEncoderDecoderModel +from datasets import load_dataset +image_processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained( + "google/vit-base-patch16-224-in21k", "google-bert/bert-base-uncased" + ) +model.config.decoder_start_token_id = tokenizer.cls_token_id +model.config.pad_token_id = tokenizer.pad_token_id +dataset = load_dataset("huggingface/cats-image") +image = dataset["test"]["image"][0] +pixel_values = image_processor(image, return_tensors="pt").pixel_values +labels = tokenizer( + "an image of two cats chilling on a couch", + return_tensors="pt", + ).input_ids +the forward function automatically creates the correct decoder_input_ids +loss = model(pixel_values=pixel_values, labels=labels).loss + +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_32.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cac88ea6f186c7ee6c1bb761c3a083de704bf7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_32.txt @@ -0,0 +1,2 @@ +This model's TensorFlow and Flax versions +were contributed by ydshieh. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_33.txt b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..870ed268c35662cc1d963201600e769ae8b1a182 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-encoder-decoder/chunk_33.txt @@ -0,0 +1,17 @@ +VisionEncoderDecoderConfig +[[autodoc]] VisionEncoderDecoderConfig + +VisionEncoderDecoderModel +[[autodoc]] VisionEncoderDecoderModel + - forward + - from_encoder_decoder_pretrained + +TFVisionEncoderDecoderModel +[[autodoc]] TFVisionEncoderDecoderModel + - call + - from_encoder_decoder_pretrained + +FlaxVisionEncoderDecoderModel +[[autodoc]] FlaxVisionEncoderDecoderModel + - call + - from_encoder_decoder_pretrained \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_3.txt b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a17070c027f3eab835178c5cf2ce9782a673b53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_3.txt @@ -0,0 +1,2 @@ +Two projection layers are added on top of both the vision and text encoder to project the output embeddings +to a shared latent space. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_4.txt b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..a16a8d1954fe7df741e14611d72d65d1b6b5497f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_4.txt @@ -0,0 +1,2 @@ +The projection layers are randomly initialized so the model should be fine-tuned on a +downstream task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_5.txt b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee9f91006f705d27ac8e120068ab3aa7b37774f2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_5.txt @@ -0,0 +1,2 @@ +This model can be used to align the vision-text embeddings using CLIP-like contrastive image-text +training and then can be used for zero-shot vision tasks such as image classification or retrieval. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_6.txt b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..05e45e2103fb7bd1cf5044f678c72162f781078e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_6.txt @@ -0,0 +1,3 @@ +In LiT: Zero-Shot Transfer with Locked-image Text Tuning it is shown how +leveraging pre-trained (locked/frozen) image and text models for contrastive learning yields significant improvement on +new zero-shot vision tasks such as image classification or retrieval.
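The composition described above can be sketched as follows: a dual encoder is assembled from a pretrained vision checkpoint and a pretrained text checkpoint, and its freshly initialized projection layers map both modalities into the shared space. The backbone names are illustrative, and the similarity scores printed here are only meaningful after contrastive fine-tuning.

```python
import requests
from PIL import Image
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
)

# Illustrative backbones; any compatible vision and text checkpoints can be combined.
vision_ckpt = "google/vit-base-patch16-224-in21k"
text_ckpt = "google-bert/bert-base-uncased"

# The two projection layers on top of the encoders are randomly initialized here.
model = VisionTextDualEncoderModel.from_vision_text_pretrained(vision_ckpt, text_ckpt)
processor = VisionTextDualEncoderProcessor(
    AutoImageProcessor.from_pretrained(vision_ckpt),
    AutoTokenizer.from_pretrained(text_ckpt),
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
)

outputs = model(**inputs)
print(outputs.logits_per_image)  # image-text similarity; useful only after contrastive fine-tuning
```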
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..2eae893f99ca69c1573ac4848c75949f91c8908b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vision-text-dual-encoder/chunk_7.txt @@ -0,0 +1,16 @@ +VisionTextDualEncoderConfig +[[autodoc]] VisionTextDualEncoderConfig +VisionTextDualEncoderProcessor +[[autodoc]] VisionTextDualEncoderProcessor + +VisionTextDualEncoderModel +[[autodoc]] VisionTextDualEncoderModel + - forward + +FlaxVisionTextDualEncoderModel +[[autodoc]] FlaxVisionTextDualEncoderModel + - call + +TFVisionTextDualEncoderModel +[[autodoc]] TFVisionTextDualEncoderModel + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef2fc03feafcbbdcf989a3e0a3d3e4eba89112ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_10.txt @@ -0,0 +1,3 @@ +Other + checkpoints provided are the fine-tuned checkpoints for down-stream tasks - VQA ('visualbert-vqa'), VCR + ('visualbert-vcr'), NLVR2 ('visualbert-nlvr2'). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7408766d5eee24c9afaf7519b89666c0d3bcf05d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_11.txt @@ -0,0 +1,2 @@ +Hence, if you are not working on these downstream tasks, it is + recommended that you use the pretrained checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2364d60e23c7413aef30b30f4304306829863a6b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_12.txt @@ -0,0 +1 @@ +For the VCR task, the authors use a fine-tuned detector for generating visual embeddings, for all the checkpoints. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3d0c1321921380e432e2894f3616011bf11595c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_13.txt @@ -0,0 +1,2 @@ +We do not provide the detector and its weights as a part of the package, but it will be available in the research + projects, and the states can be loaded directly into the detector provided. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c673011dd40d4a62829d0837009f0efcfb8bf9a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_14.txt @@ -0,0 +1 @@ +VisualBERT is a multi-modal vision and language model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4f686885e98c22a8d477ab916a7c6eafa461a92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_15.txt @@ -0,0 +1,2 @@ +It can be used for visual question answering, multiple choice, +visual reasoning and region-to-phrase correspondence tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a77e397a6fb8b4c949ee5f2da02076a797e9f32 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_16.txt @@ -0,0 +1,2 @@ +VisualBERT uses a BERT-like transformer to prepare +embeddings for image-text pairs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..381806bef0fb727404d0082fa8b0b55e3faded4e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_17.txt @@ -0,0 +1,2 @@ +Both the text and visual features are then projected to a latent space with identical +dimension. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a192410e199bcd66cfe9b26006e969dffcc8b258 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_18.txt @@ -0,0 +1,2 @@ +To feed images to the model, each image is passed through a pre-trained object detector and the regions and the +bounding boxes are extracted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddec247b280cc515d19618100579b82777d6d42d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_19.txt @@ -0,0 +1,2 @@ +The authors use the features generated after passing these regions through a pre-trained +CNN like ResNet as visual embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..470c62eef4692e28bf46f223ab078c914b5eb1ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_20.txt @@ -0,0 +1,2 @@ +They also add absolute position embeddings, and feed the resulting sequence of +vectors to a standard BERT model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_21.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..275da467d136ef57c647b04b89ea4d44543fab4f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_21.txt @@ -0,0 +1,2 @@ +The text input is concatenated in the front of the visual embeddings in the embedding +layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_22.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c7d0315b5905c9cebaa23fdeeb34b038e3c2d8e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_22.txt @@ -0,0 +1,2 @@ +The segment IDs must also be set +appropriately for the textual and visual parts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_23.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7027cd3f7dbfcd574c1dc40a1bc1b28f03c65a5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_23.txt @@ -0,0 +1 @@ +The [BertTokenizer] is used to encode the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_24.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..aec619eda1ce70e27867d54986c53cfcc72e0d5b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_24.txt @@ -0,0 +1,2 @@ +A custom detector/image processor must be used +to get the visual embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_25.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..15187cde068001912259213cba09cfa3ba18ec53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_25.txt @@ -0,0 +1,4 @@ +The following example notebooks show how to use VisualBERT with Detectron-like models: + +VisualBERT VQA demo notebook : This notebook + contains an example on VisualBERT VQA. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_26.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..09d4099a6b7dde1a634ad2cd82e200126021b8b4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_26.txt @@ -0,0 +1,2 @@ +Generate Embeddings for VisualBERT (Colab Notebook) : This notebook contains + an example on how to generate visual embeddings. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_27.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1601963f253dbb60f2f37ed58398973c6f136d25 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_27.txt @@ -0,0 +1,8 @@ +The following example shows how to get the last hidden state using [VisualBertModel]: +thon + +import torch +from transformers import BertTokenizer, VisualBertModel +model = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre") +tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") +inputs = tokenizer("What is the man eating? \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_visual_bert/chunk_28.txt b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b0fd0f393950a5e0167d368cbb3ff674aee918 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_visual_bert/chunk_28.txt @@ -0,0 +1,35 @@ +", return_tensors="pt") +this is a custom function that returns the visual embeddings given the image path +visual_embeds = get_visual_embeddings(image_path) +visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long) +visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float) +inputs.update( + { + "visual_embeds": visual_embeds, + "visual_token_type_ids": visual_token_type_ids, + "visual_attention_mask": visual_attention_mask, + } + ) +outputs = model(**inputs) +last_hidden_state = outputs.last_hidden_state + +VisualBertConfig +[[autodoc]] VisualBertConfig +VisualBertModel +[[autodoc]] VisualBertModel + - forward +VisualBertForPreTraining +[[autodoc]] VisualBertForPreTraining + - forward +VisualBertForQuestionAnswering +[[autodoc]] VisualBertForQuestionAnswering + - forward +VisualBertForMultipleChoice +[[autodoc]] VisualBertForMultipleChoice + - forward +VisualBertForVisualReasoning +[[autodoc]] VisualBertForVisualReasoning + - forward +VisualBertForRegionToPhraseAlignment +[[autodoc]] VisualBertForRegionToPhraseAlignment + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbb12fe625b5b3e426f871e532e4f1e3af040315 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_15.txt @@ -0,0 +1,2 @@ +BEiT models outperform supervised pre-trained + vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..14024301f3448f3e513fa331f2302d03355b6291 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_16.txt @@ -0,0 +1 @@ +DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4189a1b38c0ac97a0da4dc6166e184ef8d5c6e8a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_17.txt @@ -0,0 +1,2 @@ +Vision Transformers trained using + the DINO method show very interesting properties not seen with convolutional models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f3f085cb4d40bb1f6845c889ab4e489d5b477ed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_18.txt @@ -0,0 +1,2 @@ +They are capable of segmenting + objects, without having ever been trained to do so. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..dab864fd43e75edfaaa48b8942e2b6180f6b96cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_19.txt @@ -0,0 +1 @@ +DINO checkpoints can be found on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c68b5ff06a45b6a1464dfd1bf6b4d5dda089fb8c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_20.txt @@ -0,0 +1 @@ +MAE (Masked Autoencoders) by Facebook AI. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab72bc91c9a82d8dd4c27f08fbe5887b3e170829 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_21.txt @@ -0,0 +1,3 @@ +By pre-training Vision Transformers to reconstruct pixel values for a high portion + (75%) of masked patches (using an asymmetric encoder-decoder architecture), the authors show that this simple method outperforms + supervised pre-training after fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_22.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_22.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_23.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0e7b99e79ba0c0f094eb41e9179b9529cab225 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_23.txt @@ -0,0 +1,2 @@ +The original code (written in JAX) can be +found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_24.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eb244a12badfd9081b88aff501188677304b286 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_24.txt @@ -0,0 +1,2 @@ +Note that we converted the weights from Ross Wightman's timm library, +who already converted the weights from JAX to PyTorch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_25.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f88d0bd0adae8c606523f7ad412a1158ea8ff308 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_25.txt @@ -0,0 +1 @@ +Credits go to him! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_26.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5915547cea3f68b8d256bd5f34ba71cb337630a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_26.txt @@ -0,0 +1,4 @@ +Usage tips + +To feed images to the Transformer encoder, each image is split into a sequence of fixed-size non-overlapping patches, + which are then linearly embedded. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_27.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d41c4dbd36944cb6283f4e358a7f2443fd81998 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_27.txt @@ -0,0 +1,2 @@ +A [CLS] token is added to serve as representation of an entire image, which can be + used for classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_28.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..0808d1ef54d0156ec8c7ad68fb304ef32f3f19ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_28.txt @@ -0,0 +1,2 @@ +The authors also add absolute position embeddings, and feed the resulting sequence of + vectors to a standard Transformer encoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_29.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..a56edb74177c46fc1b7f89d0db382574af5d6240 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_29.txt @@ -0,0 +1,2 @@ +As the Vision Transformer expects each image to be of the same size (resolution), one can use + [ViTImageProcessor] to resize (or rescale) and normalize images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_30.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bb110e140728327213fd6eecfc806937373f9fc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_30.txt @@ -0,0 +1,2 @@ +Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of + each checkpoint. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_31.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b52a7427bb4e3510d1f54eb39a031ae09c2a0d8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_31.txt @@ -0,0 +1,2 @@ +For example, google/vit-base-patch16-224 refers to a base-sized architecture with patch + resolution of 16x16 and fine-tuning resolution of 224x224. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_32.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e67ff0b65f0c0a39dd9635e89c149fdccf8fcd69 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_32.txt @@ -0,0 +1 @@ +All checkpoints can be found on the hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_33.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae2b68bff3d01a491a2997072e058af0a6297177 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_33.txt @@ -0,0 +1,3 @@ +The available checkpoints are either (1) pre-trained on ImageNet-21k (a collection of + 14 million images and 21k classes) only, or (2) also fine-tuned on ImageNet (also referred to as ILSVRC 2012, a collection of 1.3 million + images and 1,000 classes). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_34.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..63984329ac9835582313e606f4e2ef4d610f470c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_34.txt @@ -0,0 +1 @@ +The Vision Transformer was pre-trained using a resolution of 224x224. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_35.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd1e8730f8ca63fdba1fe0e28bf3ab7f07b7ac07 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_35.txt @@ -0,0 +1,3 @@ +During fine-tuning, it is often beneficial to + use a higher resolution than pre-training (Touvron et al., 2019), (Kolesnikov + et al., 2020). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_36.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd8f0034d9478b18980d3f5be55bc4219f944897 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_36.txt @@ -0,0 +1,2 @@ +In order to fine-tune at higher resolution, the authors perform + 2D interpolation of the pre-trained position embeddings, according to their location in the original image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_37.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..db4d8afe0c51d596df4022bb95e83fa046fecf55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_37.txt @@ -0,0 +1 @@ +The best results are obtained with supervised pre-training, which is not the case in NLP. 
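The usage tips above can be tied together in a short inference sketch. This is not part of the chunked files: it assumes the google/vit-base-patch16-224 checkpoint named above, a hypothetical local image cat.jpg, and an installed Transformers version whose ViT forward pass accepts the interpolate_pos_encoding argument (used here only to illustrate the 2D interpolation of position embeddings for inputs larger than the 224x224 pre-training resolution; the 480x480 size is an arbitrary example value):

import torch
from PIL import Image
from transformers import ViTImageProcessor, ViTForImageClassification

processor = ViTImageProcessor.from_pretrained("google/vit-base-patch16-224")
model = ViTForImageClassification.from_pretrained("google/vit-base-patch16-224")

image = Image.open("cat.jpg")  # hypothetical local file

# Resize/rescale to 224x224 and normalize; the model then embeds 16x16 patches and a [CLS] token
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])

# Feeding a higher resolution: interpolate the pre-trained position embeddings in 2D
inputs_hr = processor(images=image, size={"height": 480, "width": 480}, return_tensors="pt")
with torch.no_grad():
    logits_hr = model(**inputs_hr, interpolate_pos_encoding=True).logits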
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_38.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfa234147aef9a57ef9ff5bd8c4eec3105c98f38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_38.txt @@ -0,0 +1,3 @@ +The authors also performed + an experiment with a self-supervised pre-training objective, namely masked patched prediction (inspired by masked + language modeling). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_39.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd3aab276b9b23335bc2c986dc34ba02b0959cc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_39.txt @@ -0,0 +1,2 @@ +With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant + improvement of 2% to training from scratch, but still 4% behind supervised pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_40.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..29877601ebe739a4fb730d6daa6cbd801d524fed --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_40.txt @@ -0,0 +1,2 @@ +Resources +Demo notebooks regarding inference as well as fine-tuning ViT on custom data can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_41.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f490c29c0e451a5f310e385e2f2c233b6f0bf77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_41.txt @@ -0,0 +1 @@ +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_42.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_42.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_43.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_43.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit/chunk_44.txt b/chunked/content_aware_chunking/model_doc_vit/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a8201b943cf5d1ba1afcd55a207e7cc84d49a06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit/chunk_44.txt @@ -0,0 +1,54 @@ +ViTForImageClassification is supported by: + +A blog post on how to Fine-Tune ViT for Image Classification with Hugging Face Transformers +A blog post on Image Classification with Hugging Face Transformers and Keras +A notebook on Fine-tuning for Image Classification with Hugging Face Transformers +A notebook on how to Fine-tune the Vision Transformer on CIFAR-10 with the Hugging Face Trainer +A notebook on how to Fine-tune the Vision Transformer on CIFAR-10 with PyTorch Lightning + +⚗️ Optimization + +A blog post on how to Accelerate Vision Transformer (ViT) with Quantization using Optimum + +⚡️ Inference + +A notebook on Quick demo: Vision Transformer (ViT) by Google Brain + +🚀 Deploy + +A blog post on Deploying Tensorflow Vision Models in Hugging Face with TF Serving +A blog post on Deploying Hugging Face ViT on Vertex AI +A blog post on Deploying Hugging Face ViT on Kubernetes with TF Serving + +ViTConfig +[[autodoc]] ViTConfig +ViTFeatureExtractor +[[autodoc]] ViTFeatureExtractor + - call +ViTImageProcessor +[[autodoc]] ViTImageProcessor + - preprocess + +ViTModel +[[autodoc]] ViTModel + - forward +ViTForMaskedImageModeling +[[autodoc]] ViTForMaskedImageModeling + - forward +ViTForImageClassification +[[autodoc]] ViTForImageClassification + - forward + +TFViTModel +[[autodoc]] TFViTModel + - call +TFViTForImageClassification +[[autodoc]] TFViTForImageClassification + - call + +FlaxViTModel +[[autodoc]] FlaxViTModel + - call +FlaxViTForImageClassification +[[autodoc]] FlaxViTForImageClassification + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e0abf5e230062e6f041c07fdf684bb0cefdabf7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_10.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT Hybrid. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..102d8d9c0ff7dd6f83484dcda070618f967a557c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_11.txt @@ -0,0 +1 @@ +[ViTHybridForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_12.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_13.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cedcdd5e4f3c29fdea9fe90d754ea5c4a5c57bd2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_14.txt @@ -0,0 +1,11 @@ +ViTHybridConfig +[[autodoc]] ViTHybridConfig +ViTHybridImageProcessor +[[autodoc]] ViTHybridImageProcessor + - preprocess +ViTHybridModel +[[autodoc]] ViTHybridModel + - forward +ViTHybridForImageClassification +[[autodoc]] ViTHybridForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_6.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..4300811e7dc2eb98fe2819930d1d4012e9223350 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_6.txt @@ -0,0 +1,2 @@ +When pre-trained on large amounts of +data and transferred to multiple mid-sized or small image recognition benchmarks (ImageNet, CIFAR-100, VTAB, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1539861a983a33bfabb9232031a37f50454368f5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_7.txt @@ -0,0 +1,3 @@ +), +Vision Transformer (ViT) attains excellent results compared to state-of-the-art convolutional networks while requiring +substantially fewer computational resources to train. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0e7b99e79ba0c0f094eb41e9179b9529cab225 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_hybrid/chunk_9.txt @@ -0,0 +1,2 @@ +The original code (written in JAX) can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eb34fb582b1f3c993ffa7c47bebbc33445638bc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_10.txt @@ -0,0 +1 @@ +MAE architecture. 
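To complement the ViT Hybrid resources above, here is a hedged classification sketch; it assumes the google/vit-hybrid-base-bit-384 checkpoint on the hub and a hypothetical local image, and otherwise mirrors the plain ViT API:

import torch
from PIL import Image
from transformers import ViTHybridImageProcessor, ViTHybridForImageClassification

processor = ViTHybridImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384")

inputs = processor(images=Image.open("cat.jpg"), return_tensors="pt")  # hypothetical local file
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])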
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_11.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_12.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..799a28d6990cbfe11a2a1ad4868ec10e9815efdd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_13.txt @@ -0,0 +1,2 @@ +TensorFlow version of the model was contributed by sayakpaul and +ariG23498 (equal contribution). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_14.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..07c785c83fdc8025fdf0925ff10e4a7673f68a70 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_15.txt @@ -0,0 +1,3 @@ +Usage tips + +MAE (masked auto encoding) is a method for self-supervised pre-training of Vision Transformers (ViTs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c2ff38dbf3e1a38fce4ceddd33018112176c49f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_16.txt @@ -0,0 +1,2 @@ +The pre-training objective is relatively simple: +by masking a large portion (75%) of the image patches, the model must reconstruct raw pixel values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e4ced7fb838d10751af19f7156a768e98d9ccb6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_17.txt @@ -0,0 +1 @@ +One can use [ViTMAEForPreTraining] for this purpose. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..5de50f646d9525a58494d53e783ae30cb54dc27b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_18.txt @@ -0,0 +1 @@ +After pre-training, one "throws away" the decoder used to reconstruct pixels, and one uses the encoder for fine-tuning/linear probing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bf5d5fca81b1b382087c66511b00f068bee1718 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_19.txt @@ -0,0 +1,2 @@ +This means that after +fine-tuning, one can directly plug in the weights into a [ViTForImageClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..05e5bfe09418321163a6024be4c27f75db42268b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_20.txt @@ -0,0 +1 @@ +One can use [ViTImageProcessor] to prepare images for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b4c5219ffa60c41651d897e5bc503975bbf40e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_21.txt @@ -0,0 +1 @@ +See the code examples for more info. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_22.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e5ab8199bf4ee8c527b45af9a95c5124fe1da83 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_22.txt @@ -0,0 +1 @@ +Note that the encoder of MAE is only used to encode the visual patches. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_23.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c34a445243385c5c175b5677cac11415b608c7e4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_23.txt @@ -0,0 +1,2 @@ +The encoded patches are then concatenated with mask tokens, which the decoder (which also +consists of Transformer blocks) takes as input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_24.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..f93eb3dd978b2ccd6002d3830cce0d333d1eb89a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_24.txt @@ -0,0 +1 @@ +Each mask token is a shared, learned vector that indicates the presence of a missing patch to be predicted. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_25.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2ed985a3b90437df1d27369a08e083645e8beb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_25.txt @@ -0,0 +1,2 @@ +Fixed +sin/cos position embeddings are added both to the input of the encoder and the decoder. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_26.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e693f37f6c29a7f67968c8982d6489ff687c7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_26.txt @@ -0,0 +1 @@ +For a visual understanding of how MAEs work you can check out this post. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_27.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..871cbb9440bc40726931590f059b60a36186cd36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_27.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMAE. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_28.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..18025d1042aed5ca5fd8da5114a073e0203d75db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_28.txt @@ -0,0 +1 @@ +[ViTMAEForPreTraining] is supported by this example script, allowing you to pre-train the model from scratch/further pre-train the model on custom data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_29.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cb568fb0190437e032c7c7d42e3b4b29beff099 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_29.txt @@ -0,0 +1 @@ +A notebook that illustrates how to visualize reconstructed pixel values with [ViTMAEForPreTraining] can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_30.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_30.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_31.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_31.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
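Tying the ViTMAE usage tips above together, a minimal pre-training-style forward pass might look as follows; the facebook/vit-mae-base checkpoint and the local image file are assumptions, and the output fields reflect the self-supervised objective (reconstruction loss, per-patch pixel predictions, and the random mask over patches):

import torch
from PIL import Image
from transformers import AutoImageProcessor, ViTMAEForPreTraining

processor = AutoImageProcessor.from_pretrained("facebook/vit-mae-base")
model = ViTMAEForPreTraining.from_pretrained("facebook/vit-mae-base")

inputs = processor(images=Image.open("cat.jpg"), return_tensors="pt")  # hypothetical local file
with torch.no_grad():
    outputs = model(**inputs)

print(outputs.loss)          # reconstruction loss on the ~75% of patches that were masked
print(outputs.logits.shape)  # predicted pixel values for every patch
print(outputs.mask.shape)    # binary mask indicating which patches were dropped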
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_32.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..c78e29048d6b5bd46f5741629559b99f1db21f9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_32.txt @@ -0,0 +1,16 @@ +ViTMAEConfig +[[autodoc]] ViTMAEConfig + +ViTMAEModel +[[autodoc]] ViTMAEModel + - forward +ViTMAEForPreTraining +[[autodoc]] transformers.ViTMAEForPreTraining + - forward + +TFViTMAEModel +[[autodoc]] TFViTMAEModel + - call +TFViTMAEForPreTraining +[[autodoc]] transformers.TFViTMAEForPreTraining + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..de1af9cc8d2f56e7e826e73e85cda9c87adae1db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_8.txt @@ -0,0 +1,2 @@ +Our scalable approach allows for learning high-capacity +models that generalize well: e.g., a vanilla ViT-Huge model achieves the best accuracy (87.8%) among methods that use only ImageNet-1K data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_mae/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..69ab1560f7c84c46fda920d72c4249c6c9e6f1f9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_mae/chunk_9.txt @@ -0,0 +1,2 @@ +Transfer performance in downstream +tasks outperforms supervised pre-training and shows promising scaling behavior. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8cff8c38fea4631f992654ede1f8b8b80915761 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by sayakpaul. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6abbb227074c861ad65346d8c2ae1848887e1ad5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +MSN (masked siamese networks) is a method for self-supervised pre-training of Vision Transformers (ViTs). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cf995e8ad25b6b3b904552fc07a907873f74fe5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_13.txt @@ -0,0 +1,2 @@ +The pre-training +objective is to match the prototypes assigned to the unmasked views of the images to that of the masked views of the same images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9855dfbc86d031edb9443bd51eb6ff1a2b250e6d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_14.txt @@ -0,0 +1 @@ +The authors have only released pre-trained weights of the backbone (ImageNet-1k pre-training). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ae6461439d9b77c1230f5667260905999a81e19 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_15.txt @@ -0,0 +1,2 @@ +So, to use that on your own image classification dataset, +use the [ViTMSNForImageClassification] class which is initialized from [ViTMSNModel]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..56750133c025d194477c651a047153c0c3ea3290 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_16.txt @@ -0,0 +1,2 @@ +Follow +this notebook for a detailed tutorial on fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..30587d51641767f800aef3e07cc0d1b32199e6da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_17.txt @@ -0,0 +1 @@ +MSN is particularly useful in the low-shot and extreme low-shot regimes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9988244c8b42288e188090706c423669bc03a94 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_18.txt @@ -0,0 +1,2 @@ +Notably, it achieves 75.7% top-1 accuracy with only 1% of ImageNet-1K +labels when fine-tuned. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ed7f850e3ff8ade389579660654bc705ee114a6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_19.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViT MSN. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a209bbad4b5ce018ecb71c4679879dc4c63b47af --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_20.txt @@ -0,0 +1 @@ +[ViTMSNForImageClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d5241da961e12927ecb82f92195b277b201a40 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_21.txt @@ -0,0 +1,3 @@ +See also: Image classification task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_22.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_22.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_23.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8064b34dcbfc2e80f16d2de6d0c4bf1e133bf466 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_23.txt @@ -0,0 +1,8 @@ +ViTMSNConfig +[[autodoc]] ViTMSNConfig +ViTMSNModel +[[autodoc]] ViTMSNModel + - forward +ViTMSNForImageClassification +[[autodoc]] ViTMSNForImageClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..096ba42e68f441714ee6bc27cfb1620d571bcd45 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_7.txt @@ -0,0 +1,3 @@ +For instance, +on ImageNet-1K, with only 5,000 annotated images, our base MSN model achieves 72.4% top-1 accuracy, +and with 1% of ImageNet-1K labels, we achieve 75.7% top-1 accuracy, setting a new state-of-the-art for self-supervised learning on this benchmark. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbd037d0ff9356bd717022fdb1f6dcc084ecb864 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_8.txt @@ -0,0 +1 @@ +MSN architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vit_msn/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vit_msn/chunk_9.txt @@ -0,0 +1 @@ +Taken from the original paper. 
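As a sketch of the fine-tuning setup described in the ViT MSN usage tips above (a classifier initialized from the self-supervised backbone), assuming the facebook/vit-msn-small checkpoint, a hypothetical 10-class dataset and a local image file; the classification head is newly initialized, so the logits are only meaningful after fine-tuning:

import torch
from PIL import Image
from transformers import AutoImageProcessor, ViTMSNForImageClassification

processor = AutoImageProcessor.from_pretrained("facebook/vit-msn-small")
model = ViTMSNForImageClassification.from_pretrained(
    "facebook/vit-msn-small",
    num_labels=10,  # hypothetical number of classes in your dataset
)

inputs = processor(images=Image.open("cat.jpg"), return_tensors="pt")  # hypothetical local file
with torch.no_grad():
    logits = model(**inputs).logits
print(logits.shape)  # (1, 10); fine-tune on your labelled data before relying on these scores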
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..705c24e6099ec15c309c87a12848987122803803 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_10.txt @@ -0,0 +1,3 @@ +Tips: + +At the moment, only the backbone is available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..577a45701ead09bccb3dd27cd6f25b36f4f074db --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_11.txt @@ -0,0 +1,5 @@ +VitDetConfig +[[autodoc]] VitDetConfig +VitDetModel +[[autodoc]] VitDetModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_5.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..98899493276c829f97e3f701467a86eb3267a2fd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_5.txt @@ -0,0 +1 @@ +Surprisingly, we observe: (i) it is sufficient to build a simple feature pyramid from a single-scale feature map (without the common FPN design) and (ii) it is sufficient to use window attention (without shifting) aided with very few cross-window propagation blocks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd613ce47a461c506da0cfe8104f0dea2585ff65 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_6.txt @@ -0,0 +1 @@ +With plain ViT backbones pre-trained as Masked Autoencoders (MAE), our detector, named ViTDet, can compete with the previous leading methods that were all based on hierarchical backbones, reaching up to 61.3 AP_box on the COCO dataset using only ImageNet-1K pre-training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..7508d80ff68876ac2a9031612c50b9b46838a2d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_7.txt @@ -0,0 +1 @@ +We hope our study will draw attention to research on plain-backbone detectors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitdet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vitdet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitdet/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. 
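Since only the ViTDet backbone is exposed, a quick way to inspect what it returns is to instantiate it from a config with random weights; this sketch deliberately makes no assumption about a specific hub checkpoint (swap in from_pretrained if one matches your needs):

import torch
from transformers import VitDetConfig, VitDetModel

config = VitDetConfig()
model = VitDetModel(config)  # randomly initialized backbone

pixel_values = torch.randn(1, 3, 224, 224)  # dummy image at the default 224x224 resolution
with torch.no_grad():
    outputs = model(pixel_values)
print(outputs.last_hidden_state.shape)  # backbone feature map to be consumed by a detection head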
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_10.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_11.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..314630114f738f152252f8292dbc4f725a8f6418 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_12.txt @@ -0,0 +1 @@ +ViTMatte high-level overview. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_13.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbde72c743c28d12ed835e4eb62c4ff5427e854d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_14.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ViTMatte. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..50afdd530a0cd136a0f9e91a1c7e882077052465 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_15.txt @@ -0,0 +1 @@ +A demo notebook regarding inference with [VitMatteForImageMatting], including background replacement, can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..251a18307e38d0fbcfa1f8b5306460f06bdd852d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_16.txt @@ -0,0 +1 @@ +The model expects both the image and trimap (concatenated) as input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..91190ca8fde1db4b366731585996fbe62b946f6a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_17.txt @@ -0,0 +1 @@ +Use [ViTMatteImageProcessor] for this purpose. 
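A minimal sketch of the image-plus-trimap flow just described, assuming the hustvl/vitmatte-small-composition-1k checkpoint and hypothetical local image/trimap files:

import torch
from PIL import Image
from transformers import VitMatteImageProcessor, VitMatteForImageMatting

processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

image = Image.open("photo.png").convert("RGB")  # hypothetical local files
trimap = Image.open("trimap.png").convert("L")  # single-channel trimap (foreground/background/unknown)

# The processor concatenates image and trimap into the combined input the model expects
inputs = processor(images=image, trimaps=trimap, return_tensors="pt")
with torch.no_grad():
    alphas = model(**inputs).alphas  # predicted alpha matte
print(alphas.shape)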
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fad49cc541fbddfc3148dc96461b1e7dffcda77 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_18.txt @@ -0,0 +1,8 @@ +VitMatteConfig +[[autodoc]] VitMatteConfig +VitMatteImageProcessor +[[autodoc]] VitMatteImageProcessor + - preprocess +VitMatteForImageMatting +[[autodoc]] VitMatteForImageMatting + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..0daf051417c8995e56ad3780f1107554d6340bb3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_7.txt @@ -0,0 +1 @@ +To the best of our knowledge, ViTMatte is the first work to unleash the potential of ViT on image matting with concise adaptation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..7272b05e904fa8fd8520ec030f35d173b581c68e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_8.txt @@ -0,0 +1 @@ +It inherits many superior properties from ViT to matting, including various pretraining strategies, concise architecture design, and flexible inference strategies. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vitmatte/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..293257a3e3b95f3cf647af1749432ed3aae83c9d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vitmatte/chunk_9.txt @@ -0,0 +1 @@ +We evaluate ViTMatte on Composition-1k and Distinctions-646, the most commonly used benchmark for image matting, our method achieves state-of-the-art performance and outperforms prior matting works by a large margin. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_12.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..259256671c138f1b883324c0a37aae9725fc4a4c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_12.txt @@ -0,0 +1 @@ +Our method adopts variational inference augmented with normalizing flows and an adversarial training process, which improves the expressive power of generative modeling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_13.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9efb64313e636b6bbce7aa4a096b6c4d0d3e7900 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_13.txt @@ -0,0 +1 @@ +We also propose a stochastic duration predictor to synthesize speech with diverse rhythms from input text. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_14.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..964335b095ed202b8d0236e6ac7254a5a6005396 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_14.txt @@ -0,0 +1 @@ +With the uncertainty modeling over latent variables and the stochastic duration predictor, our method expresses the natural one-to-many relationship in which a text input can be spoken in multiple ways with different pitches and rhythms. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_15.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..05941cf35cb61fcd1494407dbd6b4342e5238e53 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_15.txt @@ -0,0 +1 @@ +A subjective human evaluation (mean opinion score, or MOS) on the LJ Speech, a single speaker dataset, shows that our method outperforms the best publicly available TTS systems and achieves a MOS comparable to ground truth. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_16.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..65eebfa31e186ea0846dfb6ff0f28d98ab8424ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_16.txt @@ -0,0 +1,2 @@ +This model can also be used with TTS checkpoints from Massively Multilingual Speech (MMS) +as these checkpoints use the same architecture and a slightly modified tokenizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_17.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c013b14507c450baf3f08eaf4b3f98f247df7d2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_17.txt @@ -0,0 +1 @@ +This model was contributed by Matthijs and sanchit-gandhi. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_18.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_18.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_19.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..5435230f1f2a6d1902bcd3602ffb91af7e2a2dc5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_19.txt @@ -0,0 +1,2 @@ +Usage examples +Both the VITS and MMS-TTS checkpoints can be used with the same API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_20.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..27bf3e0ffbbf2ee77c8b7d19f479dd2bd6c6a054 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_20.txt @@ -0,0 +1,2 @@ +Since the flow-based model is non-deterministic, it +is good practice to set a seed to ensure reproducibility of the outputs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_21.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e4bd7923418a169737047b7bc4e00fe2bfc7e2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_21.txt @@ -0,0 +1,2 @@ +For languages with a Roman alphabet, +such as English or French, the tokenizer can be used directly to pre-process the text inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_22.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..f11663fc451471ffcc457e27c3bfa71e4f761767 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_22.txt @@ -0,0 +1,25 @@ +The following code example +runs a forward pass using the MMS-TTS English checkpoint: +thon +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +model = VitsModel.from_pretrained("facebook/mms-tts-eng") +inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt") +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(**inputs) +waveform = outputs.waveform[0] + +The resulting waveform can be saved as a .wav file: +thon +import scipy +scipy.io.wavfile.write("techno.wav", rate=model.config.sampling_rate, data=waveform) + +Or displayed in a Jupyter Notebook / Google Colab: +thon +from IPython.display import Audio +Audio(waveform, rate=model.config.sampling_rate) + +For certain languages with a non-Roman alphabet, such as Arabic, Mandarin or Hindi, the uroman +perl package is required to pre-process the text inputs to the Roman alphabet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_23.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cb34d4e1a9d3e1a605bf2d904d5886c7120fe62 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_23.txt @@ -0,0 +1,9 @@ +You can check whether you require the uroman package for your language by inspecting the is_uroman attribute of +the pre-trained tokenizer: +thon +from transformers import VitsTokenizer +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng") +print(tokenizer.is_uroman) + +If required, you should apply the uroman package to your text inputs prior to passing them to the VitsTokenizer, +since currently the tokenizer does not support performing the pre-processing itself. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_24.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e4f6c6ba86ae21a00410dcc98081e44d4eaa39e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_24.txt @@ -0,0 +1,6 @@ +To do this, first clone the uroman repository to your local machine and set the bash variable UROMAN to the local path: + +git clone https://github.com/isi-nlp/uroman.git +cd uroman +export UROMAN=$(pwd) +You can then pre-process the text input using the following code snippet. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_25.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ee47f99541641824e79977207342b156f7d8d0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_25.txt @@ -0,0 +1,11 @@ +You can either rely on using the bash variable +UROMAN to point to the uroman repository, or you can pass the uroman directory as an argument to the uromanize function: +thon +import torch +from transformers import VitsTokenizer, VitsModel, set_seed +import os +import subprocess +tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-kor") +model = VitsModel.from_pretrained("facebook/mms-tts-kor") +def uromanize(input_string, uroman_path): + """Convert non-Roman strings to Roman using the uroman perl package.""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vits/chunk_26.txt b/chunked/content_aware_chunking/model_doc_vits/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..35acddbca6740106f7efef3da92e0edfcaf38284 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vits/chunk_26.txt @@ -0,0 +1,30 @@ +script_path = os.path.join(uroman_path, "bin", "uroman.pl") +command = ["perl", script_path] + +process = subprocess.Popen(command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE) +# Execute the perl command +stdout, stderr = process.communicate(input=input_string.encode()) + +if process.returncode != 0: + raise ValueError(f"Error {process.returncode}: {stderr.decode()}") + +# Return the output as a string and skip the new-line character at the end +return stdout.decode()[:-1] + +text = "이봐 무슨 일이야" +uromanized_text = uromanize(text, uroman_path=os.environ["UROMAN"]) +inputs = tokenizer(text=uromanized_text, return_tensors="pt") +set_seed(555) # make deterministic +with torch.no_grad(): + outputs = model(inputs["input_ids"]) +waveform = outputs.waveform[0] + +VitsConfig +[[autodoc]] VitsConfig +VitsTokenizer +[[autodoc]] VitsTokenizer + - call + - save_vocabulary +VitsModel +[[autodoc]] VitsModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vivit/chunk_5.txt b/chunked/content_aware_chunking/model_doc_vivit/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..43c93f2fad71e6c52df267ca7df93201b12c1eb2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vivit/chunk_5.txt @@ -0,0 +1 @@ +Although transformer-based models are known to only be effective when large training datasets are available, we show how we can effectively regularise the model during training and leverage pretrained image models to be able to train on comparatively small datasets. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vivit/chunk_6.txt b/chunked/content_aware_chunking/model_doc_vivit/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6254c53c7fc8345adeb1731585c0cb166dfe76d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vivit/chunk_6.txt @@ -0,0 +1 @@ +We conduct thorough ablation studies, and achieve state-of-the-art results on multiple video classification benchmarks including Kinetics 400 and 600, Epic Kitchens, Something-Something v2 and Moments in Time, outperforming prior methods based on deep 3D convolutional networks.
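Before the ViViT API reference below, here is a hedged video-classification sketch; the google/vivit-b-16x2-kinetics400 checkpoint, the 32-frame clip length, and the dummy frames are assumptions to adapt to your own video pipeline:

import numpy as np
import torch
from transformers import VivitImageProcessor, VivitForVideoClassification

processor = VivitImageProcessor.from_pretrained("google/vivit-b-16x2-kinetics400")
model = VivitForVideoClassification.from_pretrained("google/vivit-b-16x2-kinetics400")

# Dummy clip: 32 frames of 224x224 RGB; replace with frames sampled from a real video
video = list(np.random.randint(0, 255, (32, 224, 224, 3), dtype=np.uint8))

inputs = processor(video, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
print(model.config.id2label[logits.argmax(-1).item()])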
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vivit/chunk_7.txt b/chunked/content_aware_chunking/model_doc_vivit/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3388a19916cbf363376cc9d17ecee7bc11fe937d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vivit/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by jegormeister. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vivit/chunk_8.txt b/chunked/content_aware_chunking/model_doc_vivit/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..abaef2c969a6e30645562f45c32709d080cede0b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vivit/chunk_8.txt @@ -0,0 +1 @@ +The original code (written in JAX) can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_vivit/chunk_9.txt b/chunked/content_aware_chunking/model_doc_vivit/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..bae5f79c8dba8dbf00a09e23c55fb8e5f36e397b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_vivit/chunk_9.txt @@ -0,0 +1,11 @@ +VivitConfig +[[autodoc]] VivitConfig +VivitImageProcessor +[[autodoc]] VivitImageProcessor + - preprocess +VivitModel +[[autodoc]] VivitModel + - forward +VivitForVideoClassification +[[autodoc]] transformers.VivitForVideoClassification + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_10.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b30cfde851f3d8fe337958b5cf90ee4a7cb45cb1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_10.txt @@ -0,0 +1 @@ +SeamlessM4T v2 provides the foundation on which our two newest models, SeamlessExpressive and SeamlessStreaming, are initiated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_11.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..674e95a347756dab1d11f40db8fa5361fb827e55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_11.txt @@ -0,0 +1 @@ +SeamlessExpressive enables translation that preserves vocal styles and prosody. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_12.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..10c875397993b4da2ee4eb86b61a12c71d667b2f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_12.txt @@ -0,0 +1 @@ +Compared to previous efforts in expressive speech research, our work addresses certain underexplored aspects of prosody, such as speech rate and pauses, while also preserving the style of one’s voice. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_13.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fd55b966cb833314eb158300bdec94db980e792 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_13.txt @@ -0,0 +1 @@ +As for SeamlessStreaming, our model leverages the Efficient Monotonic Multihead Attention (EMMA) mechanism to generate low-latency target translations without waiting for complete source utterances. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_14.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfef4835e6e5ee6014c0abc387646ee9c1db0d7d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_14.txt @@ -0,0 +1 @@ +As the first of its kind, SeamlessStreaming enables simultaneous speech-to-speech/text translation for multiple source and target languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_15.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ee671771cb8ef24fba00e5c92fa21f5a307c34c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_15.txt @@ -0,0 +1 @@ +To understand the performance of these models, we combined novel and modified versions of existing automatic metrics to evaluate prosody, latency, and robustness. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_16.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a11a077bc6cb796273e5184500ee97f2916c5d95 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_16.txt @@ -0,0 +1 @@ +For human evaluations, we adapted existing protocols tailored for measuring the most relevant attributes in the preservation of meaning, naturalness, and expressivity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_17.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..81cd8c0f2d009806c095aafc5fdc42fdd8205779 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_17.txt @@ -0,0 +1 @@ +To ensure that our models can be used safely and responsibly, we implemented the first known red-teaming effort for multimodal machine translation, a system for the detection and mitigation of added toxicity, a systematic evaluation of gender bias, and an inaudible localized watermarking mechanism designed to dampen the impact of deepfakes. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_18.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0ba7aeb1605f03b6847345ef2d63dc8f95d454a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_18.txt @@ -0,0 +1 @@ +Consequently, we bring major components from SeamlessExpressive and SeamlessStreaming together to form Seamless, the first publicly available system that unlocks expressive cross-lingual communication in real-time. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_19.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e811b0b7db31747195bb8e4ba1f14917439de50 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_19.txt @@ -0,0 +1 @@ +In sum, Seamless gives us a pivotal look at the technical foundation needed to turn the Universal Speech Translator from a science fiction concept into a real-world technology. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_20.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..71baa69596e548c4838be096247c94029fe210c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_20.txt @@ -0,0 +1 @@ +Finally, contributions in this work—including models, code, and a watermark detector—are publicly released and accessible at the link below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_21.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8684d438402dd4aeaf1427564107c702c5c11527 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_21.txt @@ -0,0 +1 @@ +This model was contributed by ylacombe. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_22.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_22.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_23.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..b55501fa979f4cde64adca36a0ad2a05715c0b29 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_23.txt @@ -0,0 +1,3 @@ +Usage tips + +Wav2Vec2-BERT follows the same architecture as Wav2Vec2-Conformer, but employs a causal depthwise convolutional layer and uses as input a mel-spectrogram representation of the audio instead of the raw waveform. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_24.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdd8eb7a2904a879c1aa112c8bee5cc17d824d96 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_24.txt @@ -0,0 +1,2 @@ +Wav2Vec2-BERT can use either no relative position embeddings, Shaw-like position embeddings, Transformer-XL-like position embeddings, or + rotary position embeddings by setting the correct config.position_embeddings_type. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_25.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7908285e280b2410fd92fcce2ddb38ddb2f48be0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_25.txt @@ -0,0 +1 @@ +Wav2Vec2-BERT also introduces a Conformer-based adapter network instead of a simple convolutional network. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_26.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b99b022372fbab4805024677d4269668a82540f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_26.txt @@ -0,0 +1,3 @@ +Resources + +[Wav2Vec2BertForCTC] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_27.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2542bc8d4db06a5de216e90757f242bcf23fa617 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_27.txt @@ -0,0 +1 @@ +You can also adapt these notebooks on how to finetune a speech recognition model in English, and how to finetune a speech recognition model in any language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_28.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d961e8ef7e3abea0a31a69bb2baebb0daf20b186 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_28.txt @@ -0,0 +1 @@ +[Wav2Vec2BertForSequenceClassification] can be used by adapting this example script. 
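The Wav2Vec2-BERT usage tips recorded above (chunks 23–24) make two practical points: the model consumes mel-spectrogram `input_features` rather than the raw waveform, and the flavour of relative position embeddings is selected via `config.position_embeddings_type`. A minimal sketch of both, assuming the `facebook/w2v-bert-2.0` checkpoint and the standard 🤗 Transformers feature-extractor API (the checkpoint name is an assumption, not part of the chunks above):

```python
import numpy as np
import torch
from transformers import AutoFeatureExtractor, Wav2Vec2BertConfig, Wav2Vec2BertModel

# Assumed reference checkpoint for this architecture.
feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/w2v-bert-2.0")
model = Wav2Vec2BertModel.from_pretrained("facebook/w2v-bert-2.0")

# One second of 16 kHz audio standing in for a real recording; the feature
# extractor converts the raw waveform into mel-spectrogram "input_features",
# which is what the model consumes (unlike Wav2Vec2, which takes the waveform).
waveform = np.zeros(16000, dtype=np.float32)
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    hidden_states = model(**inputs).last_hidden_state

# The position-embedding variant is a config switch, e.g. rotary embeddings:
config = Wav2Vec2BertConfig(position_embeddings_type="rotary")
untrained_model = Wav2Vec2BertModel(config)
```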
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_29.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..38dad1a68320edddc3e7f0dce344b65bce05f9a7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-bert/chunk_29.txt @@ -0,0 +1,27 @@ +See also: Audio classification task guide + +Wav2Vec2BertConfig +[[autodoc]] Wav2Vec2BertConfig +Wav2Vec2BertProcessor +[[autodoc]] Wav2Vec2BertProcessor + - call + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode +Wav2Vec2BertModel +[[autodoc]] Wav2Vec2BertModel + - forward +Wav2Vec2BertForCTC +[[autodoc]] Wav2Vec2BertForCTC + - forward +Wav2Vec2BertForSequenceClassification +[[autodoc]] Wav2Vec2BertForSequenceClassification + - forward +Wav2Vec2BertForAudioFrameClassification +[[autodoc]] Wav2Vec2BertForAudioFrameClassification + - forward +Wav2Vec2BertForXVector +[[autodoc]] Wav2Vec2BertForXVector + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_2.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaf677dc718264fa83d9a888e89e32d56f91fcc8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_2.txt @@ -0,0 +1 @@ +The Wav2Vec2-Conformer weights were released by the Meta AI team within the Fairseq library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_3.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_3.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_4.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_4.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_5.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f2816e485604abc40bbf6092ee92b5f08827156 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_5.txt @@ -0,0 +1,4 @@ +Usage tips + +Wav2Vec2-Conformer follows the same architecture as Wav2Vec2, but replaces the Attention-block with a Conformer-block + as introduced in Conformer: Convolution-augmented Transformer for Speech Recognition. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_6.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c36a6cf788c7cfbf944300de217135d926cf9b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_6.txt @@ -0,0 +1,2 @@ +For the same number of layers, Wav2Vec2-Conformer requires more parameters than Wav2Vec2, but also yields +an improved word error rate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_7.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..536d2932ba5b6eb208c4c248333282a7bca26660 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_7.txt @@ -0,0 +1 @@ +Wav2Vec2-Conformer uses the same tokenizer and feature extractor as Wav2Vec2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_8.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..9487bf2f3a091668d9ccb08b49278f84929f88fb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_8.txt @@ -0,0 +1,2 @@ +Wav2Vec2-Conformer can use either no relative position embeddings, Transformer-XL-like position embeddings, or + rotary position embeddings by setting the correct config.position_embeddings_type. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_9.txt b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..e479ae78b836ee15ecf3851a0a1fb84248e7a777 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2-conformer/chunk_9.txt @@ -0,0 +1,27 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +Wav2Vec2ConformerConfig +[[autodoc]] Wav2Vec2ConformerConfig +Wav2Vec2Conformer specific outputs +[[autodoc]] models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerForPreTrainingOutput +Wav2Vec2ConformerModel +[[autodoc]] Wav2Vec2ConformerModel + - forward +Wav2Vec2ConformerForCTC +[[autodoc]] Wav2Vec2ConformerForCTC + - forward +Wav2Vec2ConformerForSequenceClassification +[[autodoc]] Wav2Vec2ConformerForSequenceClassification + - forward +Wav2Vec2ConformerForAudioFrameClassification +[[autodoc]] Wav2Vec2ConformerForAudioFrameClassification + - forward +Wav2Vec2ConformerForXVector +[[autodoc]] Wav2Vec2ConformerForXVector + - forward +Wav2Vec2ConformerForPreTraining +[[autodoc]] Wav2Vec2ConformerForPreTraining + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3b021a4b761523b6265d7a39b54b24c91128282 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_10.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Wav2Vec2. 
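The Wav2Vec2-Conformer tips above likewise expose the position-embedding variant through `config.position_embeddings_type` and reuse the plain Wav2Vec2 tokenizer and feature extractor. A short sketch under those assumptions (the `facebook/wav2vec2-base-960h` processor checkpoint is illustrative only):

```python
from transformers import (
    Wav2Vec2ConformerConfig,
    Wav2Vec2ConformerForCTC,
    Wav2Vec2Processor,
)

# Randomly initialised Conformer variant using rotary position embeddings;
# "rotary" is one of the documented position_embeddings_type choices.
config = Wav2Vec2ConformerConfig(position_embeddings_type="rotary", vocab_size=32)
model = Wav2Vec2ConformerForCTC(config)

# Feature extraction and tokenization reuse the Wav2Vec2 classes; any Wav2Vec2
# checkpoint that ships a processor can be used here (assumed name below).
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
```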
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_11.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_12.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_13.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8118480abbc875ebf436416e07947523c145f6d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_13.txt @@ -0,0 +1 @@ +A notebook on how to leverage a pretrained Wav2Vec2 model for emotion classification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_14.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..337e0038e5a0068e3238166495fd513bd006defc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_14.txt @@ -0,0 +1,2 @@ +🌎 +[Wav2Vec2ForCTC] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_15.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..611a8e0169cb34b4cf332d85f92d7b9d7eb08618 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_15.txt @@ -0,0 +1,3 @@ +Audio classification task guide + +A blog post on boosting Wav2Vec2 with n-grams in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_16.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3557ff4e3a87abbdf690f98e12245587c1459a16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_16.txt @@ -0,0 +1 @@ +A blog post on how to finetune Wav2Vec2 for English ASR with 🤗 Transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_17.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..432f4722e9dbaf546ac1b40c2721abfbd7411af4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_17.txt @@ -0,0 +1 @@ +A blog post on finetuning XLS-R for Multi-Lingual ASR with 🤗 Transformers. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_18.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..44f0e7d090dcfd77ed3b99feea998c4336610529 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_18.txt @@ -0,0 +1 @@ +A notebook on how to create YouTube captions from any video by transcribing audio with Wav2Vec2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_19.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..c64efdc16fe23e49373d1c83874d52501b5bade5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_19.txt @@ -0,0 +1,2 @@ +🌎 +[Wav2Vec2ForCTC] is supported by a notebook on how to finetune a speech recognition model in English, and how to finetune a speech recognition model in any language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_20.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bdf74fe9760694659b534e304ed16ae1caa641e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_20.txt @@ -0,0 +1,5 @@ +Automatic speech recognition task guide + +🚀 Deploy + +A blog post on how to deploy Wav2Vec2 for Automatic Speech Recognition with Hugging Face's Transformers & Amazon SageMaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_21.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..e229d78c1da38aad7d1771be5a748dc259ea8548 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_21.txt @@ -0,0 +1,30 @@ +Wav2Vec2Config +[[autodoc]] Wav2Vec2Config +Wav2Vec2CTCTokenizer +[[autodoc]] Wav2Vec2CTCTokenizer + - call + - save_vocabulary + - decode + - batch_decode + - set_target_lang +Wav2Vec2FeatureExtractor +[[autodoc]] Wav2Vec2FeatureExtractor + - call +Wav2Vec2Processor +[[autodoc]] Wav2Vec2Processor + - call + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode +Wav2Vec2ProcessorWithLM +[[autodoc]] Wav2Vec2ProcessorWithLM + - call + - pad + - from_pretrained + - save_pretrained + - batch_decode + - decode +Decoding multiple audios +If you are planning to decode multiple batches of audios, you should consider using [~Wav2Vec2ProcessorWithLM.batch_decode] and passing an instantiated multiprocessing.Pool. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_22.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c13ade07b0a127fa5aa7943143aec3dcfed1a3e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_22.txt @@ -0,0 +1 @@ +Otherwise, [~Wav2Vec2ProcessorWithLM.batch_decode] performance will be slower than calling [~Wav2Vec2ProcessorWithLM.decode] for each audio individually, as it internally instantiates a new Pool for every call. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_23.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..b97e3dd63d48a38613a26db20dc80de34bae6d3b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_23.txt @@ -0,0 +1,31 @@ +See the example below: +thon + +Let's see how to use a user-managed pool for batch decoding multiple audios +from multiprocessing import get_context +from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC +from datasets import load_dataset +import datasets +import torch +import model, feature extractor, tokenizer +model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm").to("cuda") +processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm") +load example dataset +dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000)) +def map_to_array(batch): + batch["speech"] = batch["audio"]["array"] + return batch +prepare speech data for batch inference +dataset = dataset.map(map_to_array, remove_columns=["audio"]) +def map_to_pred(batch, pool): + inputs = processor(batch["speech"], sampling_rate=16_000, padding=True, return_tensors="pt") + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + logits = model(**inputs).logits + transcription = processor.batch_decode(logits.cpu().numpy(), pool).text + batch["transcription"] = transcription + return batch + +note: pool should be instantiated after Wav2Vec2ProcessorWithLM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_24.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..a31206486a439829faaec6a38539b76866191063 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_24.txt @@ -0,0 +1,55 @@ +otherwise, the LM won't be available to the pool's sub-processes +select number of processes and batch_size based on number of CPU cores available and on dataset size +with get_context("fork").Pool(processes=2) as pool: + result = dataset.map( + map_to_pred, batched=True, batch_size=2, fn_kwargs={"pool": pool}, remove_columns=["speech"] + ) +result["transcription"][:2] +['MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL', "NOR IS MISTER COULTER'S MANNER LESS INTERESTING THAN HIS MATTER"] + +Wav2Vec2 specific outputs +[[autodoc]] models.wav2vec2_with_lm.processing_wav2vec2_with_lm.Wav2Vec2DecoderWithLMOutput +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput +[[autodoc]] models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput +[[autodoc]] models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput + +Wav2Vec2Model +[[autodoc]] Wav2Vec2Model + - forward +Wav2Vec2ForCTC +[[autodoc]] Wav2Vec2ForCTC + - forward + - load_adapter +Wav2Vec2ForSequenceClassification +[[autodoc]] Wav2Vec2ForSequenceClassification + - forward +Wav2Vec2ForAudioFrameClassification +[[autodoc]] Wav2Vec2ForAudioFrameClassification + - forward +Wav2Vec2ForXVector +[[autodoc]] Wav2Vec2ForXVector + - forward +Wav2Vec2ForPreTraining +[[autodoc]] Wav2Vec2ForPreTraining + - forward + +TFWav2Vec2Model 
+[[autodoc]] TFWav2Vec2Model + - call +TFWav2Vec2ForSequenceClassification +[[autodoc]] TFWav2Vec2ForSequenceClassification + - call +TFWav2Vec2ForCTC +[[autodoc]] TFWav2Vec2ForCTC + - call + +FlaxWav2Vec2Model +[[autodoc]] FlaxWav2Vec2Model + - call +FlaxWav2Vec2ForCTC +[[autodoc]] FlaxWav2Vec2ForCTC + - call +FlaxWav2Vec2ForPreTraining +[[autodoc]] FlaxWav2Vec2ForPreTraining + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..170bf00511f676f7c4a65b2b6b755c3600fa0fe8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..86bb83517e667ff3699ca29bedf926e9ec35a687 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2/chunk_9.txt @@ -0,0 +1,2 @@ +Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded + using [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_10.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..32a8ed7aa05634c1c536f1be2caeff06177c9f7f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_10.txt @@ -0,0 +1,3 @@ +Wav2Vec2Phoneme can be fine-tuned on multiple language at once and decode unseen languages in a single forward pass + to a sequence of phonemes +By default, the model outputs a sequence of phonemes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_11.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..639617eae8d0f9957acffd7c05964cdb475dff35 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_11.txt @@ -0,0 +1,2 @@ +In order to transform the phonemes to a sequence of words one + should make use of a dictionary and language model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_12.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..54e6ea8417ea6c660014103c7ca4b3acf8872815 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_12.txt @@ -0,0 +1,2 @@ +Wav2Vec2Phoneme's architecture is based on the Wav2Vec2 model, for API reference, check out Wav2Vec2's documentation page +except for the tokenizer. 
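The Wav2Vec2 usage tips above state that the model takes a raw-waveform float array and that its CTC output has to be decoded with [Wav2Vec2CTCTokenizer]. A minimal end-to-end sketch of that decode path, assuming the `facebook/wav2vec2-base-960h` checkpoint:

```python
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Assumed fine-tuned English checkpoint; any CTC-fine-tuned Wav2Vec2 model works.
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

# One second of 16 kHz audio standing in for a real recording.
waveform = np.zeros(16000, dtype=np.float32)
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# CTC decoding: take the most likely token per frame, then the tokenizer
# collapses repeats and blank tokens into the final transcription.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)[0]
```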
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_13.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f83175c8175fa7a4bb1bddfb5d7e80adfac3e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_13.txt @@ -0,0 +1,6 @@ +Wav2Vec2PhonemeCTCTokenizer +[[autodoc]] Wav2Vec2PhonemeCTCTokenizer + - call + - batch_decode + - decode + - phonemize \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_4.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d5148e4d0fe1b0f25c973aa16a59c5f9e8fd10c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_4.txt @@ -0,0 +1,2 @@ +This is done by +mapping phonemes of the training languages to the target language using articulatory features. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_5.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..921614d7227df45e07be245cf33f6be4f64a03d5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_5.txt @@ -0,0 +1,3 @@ +Experiments show that +this simple method significantly outperforms prior work which introduced task-specific architectures and used only part +of a monolingually pretrained model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_6.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2d0ce8c2d4a7979e7fe4523b4b4c325655c616c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_6.txt @@ -0,0 +1 @@ +Relevant checkpoints can be found under https://huggingface.co/models?other=phoneme-recognition. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_7.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d7197132e55832bb7a046dc25fc5fbb01feb904 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_7.txt @@ -0,0 +1,2 @@ +This model was contributed by patrickvonplaten +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_8.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5240196fda65ee4681123952a0999b61300ab832 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_8.txt @@ -0,0 +1,4 @@ +Usage tips + +Wav2Vec2Phoneme uses the exact same architecture as Wav2Vec2 +Wav2Vec2Phoneme is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. 
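The Wav2Vec2Phoneme tips above note that the model shares the Wav2Vec2 architecture, consumes a raw-waveform float array, and emits phonemes that [Wav2Vec2PhonemeCTCTokenizer] decodes. A hedged sketch, with `facebook/wav2vec2-lv-60-espeak-cv-ft` assumed as the phoneme-recognition checkpoint:

```python
import numpy as np
import torch
from transformers import (
    Wav2Vec2FeatureExtractor,
    Wav2Vec2ForCTC,
    Wav2Vec2PhonemeCTCTokenizer,
)

ckpt = "facebook/wav2vec2-lv-60-espeak-cv-ft"  # assumed checkpoint name
# The phoneme tokenizer may additionally require the `phonemizer` package.
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(ckpt)
tokenizer = Wav2Vec2PhonemeCTCTokenizer.from_pretrained(ckpt)
model = Wav2Vec2ForCTC.from_pretrained(ckpt)

waveform = np.zeros(16000, dtype=np.float32)  # placeholder 16 kHz audio
inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

# The decode yields a space-separated phoneme string, not words; mapping
# phonemes to words needs an external dictionary and language model (see above).
phonemes = tokenizer.batch_decode(torch.argmax(logits, dim=-1))[0]
```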
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_9.txt b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..726fd1df0eb44983cdad30becd731acb82555930 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wav2vec2_phoneme/chunk_9.txt @@ -0,0 +1,2 @@ +Wav2Vec2Phoneme model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [Wav2Vec2PhonemeCTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8d9a4a80c39655451ecdeff278033b32e2fe584 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by patrickvonplaten. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_11.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ad58ddb8a8cff91286d83f324a8951d220fafcb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_11.txt @@ -0,0 +1,2 @@ +The Authors' code can be +found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_12.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b8f2c961a658416a87dbcd970e919c8f2c795c4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +WavLM is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_13.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1478e848873b5a3d12607e233e47595d0e1a5d1a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_13.txt @@ -0,0 +1,2 @@ +Please use + [Wav2Vec2Processor] for the feature extraction. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_14.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fef7084bc1685c8fa67f59266986f7b052302de7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_14.txt @@ -0,0 +1,2 @@ +WavLM model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded + using [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_15.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f535aa8ce48907524d4ab57427076c6122292b2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_15.txt @@ -0,0 +1 @@ +WavLM performs especially well on speaker verification, speaker identification, and speaker diarization tasks. 
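Since the WavLM tips above single out speaker verification as a strength and point to Wav2Vec2-style feature extraction on raw waveforms, here is a sketch of comparing speaker embeddings with [WavLMForXVector]; the `microsoft/wavlm-base-plus-sv` checkpoint is an assumption:

```python
import torch
from transformers import AutoFeatureExtractor, WavLMForXVector

ckpt = "microsoft/wavlm-base-plus-sv"  # assumed speaker-verification checkpoint
feature_extractor = AutoFeatureExtractor.from_pretrained(ckpt)
model = WavLMForXVector.from_pretrained(ckpt)

# Two one-second 16 kHz dummy waveforms standing in for two utterances.
utterances = [torch.randn(16000).numpy(), torch.randn(16000).numpy()]
inputs = feature_extractor(utterances, sampling_rate=16000, padding=True, return_tensors="pt")

with torch.no_grad():
    embeddings = model(**inputs).embeddings

# Cosine similarity of the normalised x-vectors; thresholding this score
# decides whether the two utterances come from the same speaker.
embeddings = torch.nn.functional.normalize(embeddings, dim=-1)
score = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1], dim=-1)
```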
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_16.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..13d1ce0853275b678e31db74cca45564e6adcf06 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_16.txt @@ -0,0 +1,22 @@ +Resources + +Audio classification task guide +Automatic speech recognition task guide + +WavLMConfig +[[autodoc]] WavLMConfig +WavLMModel +[[autodoc]] WavLMModel + - forward +WavLMForCTC +[[autodoc]] WavLMForCTC + - forward +WavLMForSequenceClassification +[[autodoc]] WavLMForSequenceClassification + - forward +WavLMForAudioFrameClassification +[[autodoc]] WavLMForAudioFrameClassification + - forward +WavLMForXVector +[[autodoc]] WavLMForXVector + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_6.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..10b4b804fb9366b63d8971fbf0bd81146d132c10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_6.txt @@ -0,0 +1,2 @@ +For better speaker discrimination, we propose an utterance mixing training strategy, where +additional overlapped utterances are created unsupervisedly and incorporated during model training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..07a55cb09e5b8c73b428500651c56a508a736cd5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_7.txt @@ -0,0 +1,2 @@ +Lastly, we scale up +the training dataset from 60k hours to 94k hours. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a022fe98f751b7bbcaf6a8c090393ee203a20daa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_8.txt @@ -0,0 +1,2 @@ +WavLM Large achieves state-of-the-art performance on the SUPERB +benchmark, and brings significant improvements for various speech processing tasks on their representative benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_wavlm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_wavlm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..885db9e19b61b8a95fbd95a830a01ecda8f5aa1b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_wavlm/chunk_9.txt @@ -0,0 +1 @@ +Relevant checkpoints can be found under https://huggingface.co/models?other=wavlm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_10.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..3109302e9217af5131bcae176d78b530c0d630f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_10.txt @@ -0,0 +1 @@ +One can use [WhisperProcessor] to prepare audio for the model, and decode the predicted ID's back into text. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_11.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..009a4bf45e64fa98927ba2a6a204f34bed53b9f6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_11.txt @@ -0,0 +1,4 @@ +To convert the model and the processor, we recommend using the following: + +python src/transformers/models/whisper/convert_openai_to_hf.py --checkpoint_path "" --pytorch_dump_folder_path "Arthur/whisper-3" --convert_preprocessor True +The script will automatically determine all necessary parameters from the OpenAI checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_12.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f641416124ef92c0b0165fa76d1bacf62e3a8be --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_12.txt @@ -0,0 +1,2 @@ +A tiktoken library needs to be installed +to perform the conversion of the OpenAI tokenizer to the tokenizers version. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_13.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8b3a879805e79ee80901fb68dfdf6df92125d91 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_13.txt @@ -0,0 +1,24 @@ +Inference +Here is a step-by-step guide to transcribing an audio sample using a pre-trained Whisper model: +thon + +from datasets import load_dataset +from transformers import WhisperProcessor, WhisperForConditionalGeneration +Select an audio file and read it: +ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") +audio_sample = ds[0]["audio"] +waveform = audio_sample["array"] +sampling_rate = audio_sample["sampling_rate"] +Load the Whisper model in Hugging Face format: +processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") +model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") +Use the model and processor to transcribe the audio: +input_features = processor( + waveform, sampling_rate=sampling_rate, return_tensors="pt" + ).input_features +Generate token ids +predicted_ids = model.generate(input_features) +Decode token ids to text +transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True) +transcription[0] +' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_14.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb3a144dfb201bb16700fa2a47fda581ab113f92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_14.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Whisper. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_15.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_15.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_16.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_16.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_17.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2322d5a39feded41dbbf7952881348eca2712e38 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_17.txt @@ -0,0 +1 @@ +A fork with a script to convert a Whisper model in Hugging Face format to OpenAI format. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_18.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8908b42f83758e46ef51a44bf5f9e373dbf183b9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_18.txt @@ -0,0 +1,74 @@ +🌎 +Usage example: + +pip install -U openai-whisper +python convert_hf_to_openai.py \ + --checkpoint openai/whisper-tiny \ + --whisper_dump_path whisper-tiny-openai.pt + +WhisperConfig +[[autodoc]] WhisperConfig +WhisperTokenizer +[[autodoc]] WhisperTokenizer + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_decode + - decode + - basic_normalize + - normalize +WhisperTokenizerFast +[[autodoc]] WhisperTokenizerFast + - set_prefix_tokens + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary + - batch_decode + - decode + - basic_normalize + - normalize +WhisperFeatureExtractor +[[autodoc]] WhisperFeatureExtractor + - call +WhisperProcessor +[[autodoc]] WhisperProcessor + - call + - from_pretrained + - save_pretrained + - batch_decode + - decode + +WhisperModel +[[autodoc]] WhisperModel + - forward + - _mask_input_features +WhisperForConditionalGeneration +[[autodoc]] WhisperForConditionalGeneration + - forward + - generate +WhisperForCausalLM +[[autodoc]] WhisperForCausalLM + - forward +WhisperForAudioClassification +[[autodoc]] WhisperForAudioClassification + - forward + +TFWhisperModel +[[autodoc]] TFWhisperModel + - call +TFWhisperForConditionalGeneration +[[autodoc]] TFWhisperForConditionalGeneration + - call + +FlaxWhisperModel +[[autodoc]] FlaxWhisperModel + - call +FlaxWhisperForConditionalGeneration +[[autodoc]] FlaxWhisperForConditionalGeneration + - call +FlaxWhisperForAudioClassification +[[autodoc]] FlaxWhisperForAudioClassification + - call \ No newline at end of file diff --git 
a/chunked/content_aware_chunking/model_doc_whisper/chunk_7.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_7.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_8.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f3dcc8a02b6c83bf2622434ae46d71ec9522825 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_8.txt @@ -0,0 +1,3 @@ +Usage tips + +The model usually performs well without requiring any finetuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_whisper/chunk_9.txt b/chunked/content_aware_chunking/model_doc_whisper/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca9c3139bf5ef1ddfbc53839ea14cf9a73f188a2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_whisper/chunk_9.txt @@ -0,0 +1 @@ +The architecture follows a classic encoder-decoder architecture, which means that it relies on the [~generation.GenerationMixin.generate] function for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1d5ad39594340f4f0e34c5e5829934c84dea4e5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_10.txt @@ -0,0 +1 @@ +In particular, under fully-supervised settings, our approach achieves a top-1 accuracy of 87.1% on Kinectics-400, while using 12 times fewer FLOPs compared with Swin-L and ViViT-H. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..a921a3e700ccafdbffcda2b423e8bc61a3309feb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_11.txt @@ -0,0 +1 @@ +In zero-shot experiments, our approach surpasses the current state-of-the-art methods by +7.6% and +14.9% in terms of top-1 accuracy under two popular protocols. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a12d97304898a8f7e1ee6696ccda5a0bbac307e0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_12.txt @@ -0,0 +1 @@ +In few-shot scenarios, our approach outperforms previous best methods by +32.1% and +23.1% when the labeled data is extremely limited. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2993be146db3bfda99175839234068907cabe86f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_13.txt @@ -0,0 +1,3 @@ +Tips: + +Usage of X-CLIP is identical to CLIP. 
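The X-CLIP tip above says usage mirrors CLIP, except that a video (a list of sampled frames) is passed alongside the candidate text labels. A minimal zero-shot video-classification sketch, assuming the `microsoft/xclip-base-patch32` checkpoint and 8 frames per clip:

```python
import numpy as np
import torch
from transformers import AutoProcessor, XCLIPModel

ckpt = "microsoft/xclip-base-patch32"  # assumed checkpoint name
processor = AutoProcessor.from_pretrained(ckpt)
model = XCLIPModel.from_pretrained(ckpt)

# 8 random 224x224 RGB frames standing in for a real sampled video clip.
video = list(np.random.randint(0, 256, (8, 224, 224, 3), dtype=np.uint8))
labels = ["playing sports", "cooking", "playing an instrument"]

inputs = processor(text=labels, videos=video, return_tensors="pt", padding=True)
with torch.no_grad():
    outputs = model(**inputs)

# Video-text similarity scores, softmaxed over the candidate labels.
probs = outputs.logits_per_video.softmax(dim=1)
```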
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_14.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..96977a8761cf95fce812fa3a4ff34af033d2ab24 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_14.txt @@ -0,0 +1 @@ +X-CLIP architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_15.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_15.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_16.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_16.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_17.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_17.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_18.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..60f2fcccc36b013a8fc6c4b6b255f1e4cd59c37e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_18.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with X-CLIP. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_19.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6401887b7d9a3b942ad5d759917c68eeb9c8338 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_19.txt @@ -0,0 +1 @@ +Demo notebooks for X-CLIP can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_20.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_20.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_21.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_21.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_22.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e5e14141394fa0116b23c5bd503cfacf01ed22e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_22.txt @@ -0,0 +1,20 @@ +XCLIPProcessor +[[autodoc]] XCLIPProcessor +XCLIPConfig +[[autodoc]] XCLIPConfig + - from_text_vision_configs +XCLIPTextConfig +[[autodoc]] XCLIPTextConfig +XCLIPVisionConfig +[[autodoc]] XCLIPVisionConfig +XCLIPModel +[[autodoc]] XCLIPModel + - forward + - get_text_features + - get_video_features +XCLIPTextModel +[[autodoc]] XCLIPTextModel + - forward +XCLIPVisionModel +[[autodoc]] XCLIPVisionModel + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..995f7999eebd918c7255e43ae073b96f05603974 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_7.txt @@ -0,0 +1 @@ +Such module is lightweight and can be plugged into pretrained language-image models seamlessly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..96b52d244f144cda4a6758aef63ffd679e107f46 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_8.txt @@ -0,0 +1 @@ +Moreover, we propose a video-specific prompting scheme, which leverages video content information for generating discriminative textual prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xclip/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xclip/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd38d89496dccfda70c2767496b87636e5ff1ffa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xclip/chunk_9.txt @@ -0,0 +1 @@ +Extensive experiments demonstrate that our approach is effective and can be generalized to different video recognition scenarios. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xglm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xglm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9942f28cf486529b9bb1ad53ea33cc71fec2a74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xglm/chunk_10.txt @@ -0,0 +1,35 @@ +Resources + +Causal language modeling task guide + +XGLMConfig +[[autodoc]] XGLMConfig +XGLMTokenizer +[[autodoc]] XGLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +XGLMTokenizerFast +[[autodoc]] XGLMTokenizerFast + +XGLMModel +[[autodoc]] XGLMModel + - forward +XGLMForCausalLM +[[autodoc]] XGLMForCausalLM + - forward + +TFXGLMModel +[[autodoc]] TFXGLMModel + - call +TFXGLMForCausalLM +[[autodoc]] TFXGLMForCausalLM + - call + +FlaxXGLMModel +[[autodoc]] FlaxXGLMModel + - call +FlaxXGLMForCausalLM +[[autodoc]] FlaxXGLMForCausalLM + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xglm/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xglm/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..00f9b074e248356dde8ba654f2811ff5546d856b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xglm/chunk_7.txt @@ -0,0 +1,2 @@ +Finally, we evaluate our models +in social value tasks such as hate speech detection in five languages and find it has limitations similar to comparable sized GPT-3 models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xglm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xglm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..155d5ca3de1517731c876f86bcd2285f7d94321c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xglm/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by Suraj. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xglm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xglm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xglm/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c3d0926c3557f620bec98b6a4bd85cc9cc7733a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_10.txt @@ -0,0 +1 @@ +The Authors' code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b3545e4ecfa34bf1d7418e00ee6d512180ef09c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_11.txt @@ -0,0 +1,20 @@ +Resources + +Causal language modeling task guide +Translation task guide +Summarization task guide + +XLMProphetNetConfig +[[autodoc]] XLMProphetNetConfig +XLMProphetNetTokenizer +[[autodoc]] XLMProphetNetTokenizer +XLMProphetNetModel +[[autodoc]] XLMProphetNetModel +XLMProphetNetEncoder +[[autodoc]] XLMProphetNetEncoder +XLMProphetNetDecoder +[[autodoc]] XLMProphetNetDecoder +XLMProphetNetForConditionalGeneration +[[autodoc]] XLMProphetNetForConditionalGeneration +XLMProphetNetForCausalLM +[[autodoc]] XLMProphetNetForCausalLM \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..987e58fd032e2de5a13ff2ed2109bd4cb40f659f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_6.txt @@ -0,0 +1,2 @@ +The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent +overfitting on strong local correlations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11a4d8646b4a7b01bf314143f65a2c898c9fedb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_7.txt @@ -0,0 +1,2 @@ +We pre-train ProphetNet using a base scale dataset (16GB) and a large scale +dataset (160GB) respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb8994110ca39fb5e8fea2bf4efaeffaf4242deb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_8.txt @@ -0,0 +1,2 @@ +Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for +abstractive summarization and question generation tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0be51aae1b11f030107017059fa099ff9850b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-prophetnet/chunk_9.txt @@ -0,0 +1,2 @@ +Experimental results show that ProphetNet achieves new +state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0d1bc42b4cb70f50ee49154c98cd30fd3869363 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_10.txt @@ -0,0 +1,3 @@ +Unlike some XLM multilingual models, it does +not require lang tensors to understand which language is used, and should be able to determine the correct +language from the input ids. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc4cf3930a2e193ef3ff7b140a63ba46b1964965 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_11.txt @@ -0,0 +1,32 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +XLMRobertaXLConfig +[[autodoc]] XLMRobertaXLConfig +XLMRobertaXLModel +[[autodoc]] XLMRobertaXLModel + - forward +XLMRobertaXLForCausalLM +[[autodoc]] XLMRobertaXLForCausalLM + - forward +XLMRobertaXLForMaskedLM +[[autodoc]] XLMRobertaXLForMaskedLM + - forward +XLMRobertaXLForSequenceClassification +[[autodoc]] XLMRobertaXLForSequenceClassification + - forward +XLMRobertaXLForMultipleChoice +[[autodoc]] XLMRobertaXLForMultipleChoice + - forward +XLMRobertaXLForTokenClassification +[[autodoc]] XLMRobertaXLForTokenClassification + - forward +XLMRobertaXLForQuestionAnswering +[[autodoc]] XLMRobertaXLForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_4.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fd7c6f0642eb141c6551ae0981891cf8b4006dd --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_4.txt @@ -0,0 +1 @@ +Our model also outperforms the RoBERTa-Large model on several English tasks of the GLUE benchmark by 0.3% on average while handling 99 more languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_5.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..1983b93d3180faff60a2ddef4a1de3ba0a5e2899 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_5.txt @@ -0,0 +1 @@ +This suggests pretrained models with larger capacity may obtain both strong performance on high-resource languages while greatly improving low-resource languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..eff8e21b073ac9d55c952161a23de3f955dd4c78 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_6.txt @@ -0,0 +1 @@ +We make our code and models publicly available. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..45912a0d8d29c186a11386aa5bd829cb490293f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by Soonhwan-Kwon and stefan-it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_8.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce40b67e3b9fbe32f7961b0c789dab1f18df4cef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta-xl/chunk_9.txt @@ -0,0 +1,2 @@ +Usage tips +XLM-RoBERTa-XL is a multilingual model trained on 100 different languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..93cbb4442d1e14bab327a44d82ff911c1a492d16 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_10.txt @@ -0,0 +1 @@ +This model was contributed by stefan-it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_11.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dee41e8515bd67d981b4d3e340ad294f58bab41 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_12.txt @@ -0,0 +1,3 @@ +Usage tips + +XLM-RoBERTa is a multilingual model trained on 100 different languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c43ab82ca9fe69e1a4f55ecedf5f76b05f8b67c2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_13.txt @@ -0,0 +1,3 @@ +Unlike some XLM multilingual models, it does + not require lang tensors to understand which language is used, and should be able to determine the correct + language from the input ids. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_14.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6c0a4032f19ca593be6ea27c043fdaacc326071 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_14.txt @@ -0,0 +1 @@ +Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_15.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..59aada649db7be89b549259e78c4543f2865e8df --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_15.txt @@ -0,0 +1 @@ +It only uses masked language modeling on sentences coming from one language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_16.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e6b0677c09d91e2ea69cde1e4e826fbcdca0213 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_16.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with XLM-RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_17.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a7f87b615bc6d943e5399a0b0d3f8ed96d0162 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_17.txt @@ -0,0 +1 @@ +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_18.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_18.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_19.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..03e6b0f8aeb697dd172529e72d580e6768cc61a3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_19.txt @@ -0,0 +1,2 @@ +A blog post on how to finetune XLM RoBERTa for multiclass classification with Habana Gaudi on AWS +[XLMRobertaForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_20.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c100b98eeefbe925ba444e45c2dec9776c9ffc1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_20.txt @@ -0,0 +1 @@ +[TFXLMRobertaForSequenceClassification] is supported by this example script and notebook. 
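The chunk above notes that XLM-RoBERTa works out which language it is reading from the input ids alone, so no lang tensors are passed anywhere. A minimal sketch of that usage follows; the FacebookAI/xlm-roberta-base checkpoint name and the example sentences are assumptions, not taken from this diff, and any XLM-RoBERTa checkpoint should behave the same way.

```python
import torch
from transformers import AutoTokenizer, XLMRobertaForMaskedLM

# Checkpoint name is an assumption; substitute any XLM-RoBERTa checkpoint.
checkpoint = "FacebookAI/xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = XLMRobertaForMaskedLM.from_pretrained(checkpoint)

# One English and one French sentence -- note that no lang tensor is built or passed.
inputs = tokenizer(
    ["The capital of France is <mask>.", "La capitale de la France est <mask>."],
    return_tensors="pt",
    padding=True,
)

with torch.no_grad():
    logits = model(**inputs).logits  # (batch, sequence_length, vocab_size)

# Pick the highest-scoring token at each <mask> position.
mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)
predicted_ids = logits[mask_positions].argmax(dim=-1)
print(tokenizer.convert_ids_to_tokens(predicted_ids.tolist()))
```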
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_21.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee15fc0e71b159a2471f0430ffcb73c6ba0c5a10 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_21.txt @@ -0,0 +1 @@ +[FlaxXLMRobertaForSequenceClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_22.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0047940e9901261e09d7439641c306743cd0efa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_22.txt @@ -0,0 +1 @@ +Text classification chapter of the 🤗 Hugging Face Task Guides. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_23.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c2c103d9c7ca3c746b1c0ba20ba6ac9b7db71e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_23.txt @@ -0,0 +1,3 @@ +Text classification task guide + +[XLMRobertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_24.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab38eac618e3a3582c743e27c2e757a3fa9473f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_24.txt @@ -0,0 +1 @@ +[TFXLMRobertaForTokenClassification] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_25.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..61edf7e793d267347b3f0538898e159a6d9de4cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_25.txt @@ -0,0 +1 @@ +[FlaxXLMRobertaForTokenClassification] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_26.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..69e21faf2c5098fb807509f480ff122a6a2859c7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_26.txt @@ -0,0 +1 @@ +Token classification chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_27.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..812baf941ca75221347e10d2cf9b70770fc49984 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_27.txt @@ -0,0 +1,3 @@ +Token classification task guide + +[XLMRobertaForCausalLM] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_28.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..1898b715c36e842111ed21e08b5281a3c921cd56 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_28.txt @@ -0,0 +1 @@ +Causal language modeling chapter of the 🤗 Hugging Face Task Guides. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_29.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..c47ce79376485ac9a908dc0c7854321b4a9c633e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_29.txt @@ -0,0 +1,3 @@ +Causal language modeling task guide + +[XLMRobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_30.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1ffbd5b69935d59106d73e9b47a84ef05c43e80 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_30.txt @@ -0,0 +1 @@ +[TFXLMRobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_31.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..501c6e22cc0615d7753c2e025d8c82d3aedf088c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_31.txt @@ -0,0 +1 @@ +[FlaxXLMRobertaForMaskedLM] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_32.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2b5fefece97efd08b6147d0c598a5443817bec --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_32.txt @@ -0,0 +1 @@ +Masked language modeling chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_33.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..89a44ee203a5cb4ef65edd800de54cf3dff3a7aa --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_33.txt @@ -0,0 +1,3 @@ +Masked language modeling + +[XLMRobertaForQuestionAnswering] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_34.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..919544454ce82ebb8bcb30ca787117aa46ed21b0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_34.txt @@ -0,0 +1 @@ +[TFXLMRobertaForQuestionAnswering] is supported by this example script and notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_35.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..71c0d086534e47b58b2174b53fa161b91fa14fd7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_35.txt @@ -0,0 +1 @@ +[FlaxXLMRobertaForQuestionAnswering] is supported by this example script. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_36.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..38996d3f4fef4d6454d1d2c12acfb05d3bf81ec8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_36.txt @@ -0,0 +1 @@ +Question answering chapter of the 🤗 Hugging Face Course. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_37.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..46ee4b476a6a61af04dd03a5d7f93e6af6318071 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_37.txt @@ -0,0 +1,5 @@ +Question answering task guide + +Multiple choice + +[XLMRobertaForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_38.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..315769567d31e8573fc35a5ac514150a8de5c467 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_38.txt @@ -0,0 +1 @@ +[TFXLMRobertaForMultipleChoice] is supported by this example script and notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_39.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..fefcfac60aac1b447f046c91419105007dc93536 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_39.txt @@ -0,0 +1,5 @@ +Multiple choice task guide + +🚀 Deploy + +A blog post on how to Deploy Serverless XLM RoBERTa on AWS Lambda. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_40.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2152ba69372895fb5a3f3ffb1da38a3a4545792 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_40.txt @@ -0,0 +1 @@ +This implementation is the same as RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_41.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..327ccb7551171cfa083405813b172ee575f839ab --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_41.txt @@ -0,0 +1 @@ +Refer to the documentation of RoBERTa for usage examples as well as the information relative to the inputs and outputs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_42.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..b968f6db4e9efc5ba7ba9cf31d7b04809d239f8f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_42.txt @@ -0,0 +1,76 @@ +XLMRobertaConfig +[[autodoc]] XLMRobertaConfig +XLMRobertaTokenizer +[[autodoc]] XLMRobertaTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +XLMRobertaTokenizerFast +[[autodoc]] XLMRobertaTokenizerFast + +XLMRobertaModel +[[autodoc]] XLMRobertaModel + - forward +XLMRobertaForCausalLM +[[autodoc]] XLMRobertaForCausalLM + - forward +XLMRobertaForMaskedLM +[[autodoc]] XLMRobertaForMaskedLM + - forward +XLMRobertaForSequenceClassification +[[autodoc]] XLMRobertaForSequenceClassification + - forward +XLMRobertaForMultipleChoice +[[autodoc]] XLMRobertaForMultipleChoice + - forward +XLMRobertaForTokenClassification +[[autodoc]] XLMRobertaForTokenClassification + - forward +XLMRobertaForQuestionAnswering +[[autodoc]] XLMRobertaForQuestionAnswering + - forward + +TFXLMRobertaModel +[[autodoc]] TFXLMRobertaModel + - call +TFXLMRobertaForCausalLM +[[autodoc]] TFXLMRobertaForCausalLM + - call +TFXLMRobertaForMaskedLM +[[autodoc]] TFXLMRobertaForMaskedLM + - call +TFXLMRobertaForSequenceClassification +[[autodoc]] TFXLMRobertaForSequenceClassification + - call +TFXLMRobertaForMultipleChoice +[[autodoc]] TFXLMRobertaForMultipleChoice + - call +TFXLMRobertaForTokenClassification +[[autodoc]] TFXLMRobertaForTokenClassification + - call +TFXLMRobertaForQuestionAnswering +[[autodoc]] TFXLMRobertaForQuestionAnswering + - call + +FlaxXLMRobertaModel +[[autodoc]] FlaxXLMRobertaModel + - call +FlaxXLMRobertaForCausalLM +[[autodoc]] FlaxXLMRobertaForCausalLM + - call +FlaxXLMRobertaForMaskedLM +[[autodoc]] FlaxXLMRobertaForMaskedLM + - call +FlaxXLMRobertaForSequenceClassification +[[autodoc]] FlaxXLMRobertaForSequenceClassification + - call +FlaxXLMRobertaForMultipleChoice +[[autodoc]] FlaxXLMRobertaForMultipleChoice + - call +FlaxXLMRobertaForTokenClassification +[[autodoc]] FlaxXLMRobertaForTokenClassification + - call +FlaxXLMRobertaForQuestionAnswering +[[autodoc]] FlaxXLMRobertaForQuestionAnswering + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0f617817775b1cb5d91ad5543f452ceea6a4f1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_8.txt @@ -0,0 +1,2 @@ +Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing +per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b32e0abd439bcb2515fecd264ae2be7836c5abc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-roberta/chunk_9.txt @@ -0,0 +1,2 @@ +We +will make XLM-R code, data, and models publicly available.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c945adb752934f9e41f9573e6e17e1e2d34fac5e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_10.txt @@ -0,0 +1,4 @@ +Usage tips + +XLM-V is compatible with the XLM-RoBERTa model architecture, only model weights from fairseq + library had to be converted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..468a0e2d726473eed52d1af8b1fdc96760ff0c92 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_11.txt @@ -0,0 +1 @@ +The XLMTokenizer implementation is used to load the vocab and performs tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd85834528eb8861d5b24e3abe0cff95df99c5b3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_12.txt @@ -0,0 +1 @@ +A XLM-V (base size) model is available under the facebook/xlm-v-base identifier. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f79b8c79ec7f4cba8fe99e2fbc2922c969ea3f00 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_13.txt @@ -0,0 +1 @@ +XLM-V architecture is the same as XLM-RoBERTa, refer to XLM-RoBERTa documentation for API reference, and examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_4.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca0b36b37a7bb44466de5c3ff1a16079dde9639a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_4.txt @@ -0,0 +1 @@ +This vocabulary bottleneck limits the representational capabilities of multilingual models like XLM-R. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_5.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..829c0ea726e09de150a51cd0c57e02072766c80f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_5.txt @@ -0,0 +1,3 @@ +In this paper, we introduce a new approach for scaling to very large multilingual vocabularies by +de-emphasizing token sharing between languages with little lexical overlap and assigning vocabulary capacity +to achieve sufficient coverage for each individual language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd4667fcecb57539a1797772e4194d576a870e81 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_6.txt @@ -0,0 +1,3 @@ +Tokenizations using our vocabulary are typically +more semantically meaningful and shorter compared to XLM-R. 
Leveraging this improved vocabulary, we train XLM-V, +a multilingual language model with a one million token vocabulary. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..20548dd9961136b43a665c6e038774db8c3edbd9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_7.txt @@ -0,0 +1,3 @@ +XLM-V outperforms XLM-R on every task we +tested on ranging from natural language inference (XNLI), question answering (MLQA, XQuAD, TyDiQA), and +named entity recognition (WikiAnn) to low-resource tasks (Americas NLI, MasakhaNER). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5f2f5237ee38754f651c2e78d6f00e7bbcf5d74 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by stefan-it, including detailed experiments with XLM-V on downstream tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm-v/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..28254fa34d5477273eed70dc81cbe67208063c34 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm-v/chunk_9.txt @@ -0,0 +1 @@ +The experiments repository can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1af889c5ada74a6cdd5ba60fab961644777c19c5 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..65fac60d9f3e043057d3208b54f368312eb72272 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_12.txt @@ -0,0 +1,2 @@ +Make sure to + select the correct objective for your task (e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffd8f59f15de852797b24142c731adfa6fb8ef7b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_13.txt @@ -0,0 +1 @@ +MLM checkpoints are not suitable for generation). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_14.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8db503ac4ea62bd0b0c5f2e9a274dae43d37cd6 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_14.txt @@ -0,0 +1 @@ +XLM has multilingual checkpoints which leverage a specific lang parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_15.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d49c7de29a3bb22f6e954add78e89d502239cd3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_15.txt @@ -0,0 +1 @@ +Check out the multi-lingual page for more information. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_16.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..6486ef922bda071a852b67e012a6ed89146a1fb0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_16.txt @@ -0,0 +1 @@ +A transformer model trained on several languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_17.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..14351e433d9d6945793cb2beeca42e06b0a0623c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_17.txt @@ -0,0 +1,3 @@ +There are three different types of training for this model and the library provides checkpoints for all of them: + +Causal language modeling (CLM), which is the traditional autoregressive training (so this model could be in the previous section as well). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_18.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8941e3d309b50236beb156805527eeb9c08b5214 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_18.txt @@ -0,0 +1 @@ +One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span over several documents in one of those languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_19.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3a577e8fedb6e5227745b2bb358aabc0087406e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_19.txt @@ -0,0 +1 @@ +Masked language modeling (MLM), which is like RoBERTa. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_20.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d1d6f8bb98161870ed08ce7da071fbe8cd28913 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_20.txt @@ -0,0 +1 @@ +One of the languages is selected for each training sample, and the model input is a sentence of 256 tokens that may span over several documents in one of those languages, with dynamic masking of the tokens.
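Because the XLM chunks above say that the multilingual checkpoints rely on an explicit lang parameter, here is a minimal sketch of passing language embeddings at inference time. It assumes the xlm-clm-enfr-1024 checkpoint and the standard XLMTokenizer.lang2id mapping; neither is named in this diff.

```python
import torch
from transformers import XLMTokenizer, XLMWithLMHeadModel

# Checkpoint name is an assumption: an XLM CLM checkpoint covering English and French.
checkpoint = "FacebookAI/xlm-clm-enfr-1024"
tokenizer = XLMTokenizer.from_pretrained(checkpoint)
model = XLMWithLMHeadModel.from_pretrained(checkpoint)

input_ids = tokenizer("Wikipedia was used to", return_tensors="pt").input_ids

# Multilingual XLM checkpoints expect a `langs` tensor with one language id per token.
english_id = tokenizer.lang2id["en"]  # maps a language code to its embedding index
langs = torch.full_like(input_ids, english_id)

with torch.no_grad():
    outputs = model(input_ids, langs=langs)
print(outputs.logits.shape)  # (batch, sequence_length, vocab_size)
```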
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_21.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..35fc6168272f39f7dec0aeb085a7502b1fb56a04 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_21.txt @@ -0,0 +1 @@ +A combination of MLM and translation language modeling (TLM). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_22.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc07d69da39ba1f9c879f1ab0d17da2b3f6c3112 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_22.txt @@ -0,0 +1 @@ +This consists of concatenating a sentence in two different languages, with random masking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_23.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..0859cc86dbb48742f250ef004c6c197e440034e3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_23.txt @@ -0,0 +1 @@ +To predict one of the masked tokens, the model can use both, the surrounding context in language 1 and the context given by language 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_24.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..6db55f176e8d1167b487d95bb1e568911e78a685 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_24.txt @@ -0,0 +1,60 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +XLMConfig +[[autodoc]] XLMConfig +XLMTokenizer +[[autodoc]] XLMTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +XLM specific outputs +[[autodoc]] models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput + +XLMModel +[[autodoc]] XLMModel + - forward +XLMWithLMHeadModel +[[autodoc]] XLMWithLMHeadModel + - forward +XLMForSequenceClassification +[[autodoc]] XLMForSequenceClassification + - forward +XLMForMultipleChoice +[[autodoc]] XLMForMultipleChoice + - forward +XLMForTokenClassification +[[autodoc]] XLMForTokenClassification + - forward +XLMForQuestionAnsweringSimple +[[autodoc]] XLMForQuestionAnsweringSimple + - forward +XLMForQuestionAnswering +[[autodoc]] XLMForQuestionAnswering + - forward + +TFXLMModel +[[autodoc]] TFXLMModel + - call +TFXLMWithLMHeadModel +[[autodoc]] TFXLMWithLMHeadModel + - call +TFXLMForSequenceClassification +[[autodoc]] TFXLMForSequenceClassification + - call +TFXLMForMultipleChoice +[[autodoc]] TFXLMForMultipleChoice + - call +TFXLMForTokenClassification +[[autodoc]] TFXLMForTokenClassification + - call +TFXLMForQuestionAnsweringSimple +[[autodoc]] TFXLMForQuestionAnsweringSimple + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1fe80c3123b2f6e4fca1c45a430189427c9fcd0 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_8.txt @@ -0,0 +1 @@ +Our code and 
pretrained models will be made publicly available. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlm/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlm/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlm/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e72e44f997929c5575e4b5644ef0135c1cfa6f31 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_10.txt @@ -0,0 +1,2 @@ +Due to the difficulty of training a fully auto-regressive model over various factorization order, XLNet is pretrained + using only a sub-set of the output tokens as target which are selected with the target_mapping input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb9f64f5f93d616da8adbde3ea30e1d8d5042055 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_11.txt @@ -0,0 +1 @@ +To use XLNet for sequential decoding (i.e. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..11aefe05e3863691b40a734286563b74a7f8680c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_12.txt @@ -0,0 +1,4 @@ +not in fully bi-directional setting), use the perm_mask and + target_mapping inputs to control the attention span and outputs (see examples in + examples/pytorch/text-generation/run_generation.py) +XLNet is one of the few models that has no sequence length limit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..63f22e5e64ec37225b72c84d2bfac8608f68332c --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_13.txt @@ -0,0 +1 @@ +XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_14.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..3386534afda51bcb0329e987b894814aab2d75ef --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_14.txt @@ -0,0 +1 @@ +It permutes the tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. 
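Since the XLNet chunks above describe steering the attention pattern with the perm_mask and target_mapping inputs, a minimal sketch of predicting one target token is shown below. The xlnet-base-cased checkpoint name and the example sentence are assumptions; the mask construction follows the pattern described in the tips rather than any script included in this diff.

```python
import torch
from transformers import XLNetLMHeadModel, XLNetTokenizer

# Checkpoint name is an assumption; any XLNet checkpoint exposes the same inputs.
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetLMHeadModel.from_pretrained("xlnet-base-cased")

input_ids = tokenizer("The capital of France is Paris", return_tensors="pt").input_ids
seq_len = input_ids.shape[1]
target_index = seq_len - 1  # predict the last token from the tokens before it

# perm_mask[b, i, j] = 1 means position i may NOT attend to position j.
# Hide the target token from every position so the model has to predict it.
perm_mask = torch.zeros(1, seq_len, seq_len)
perm_mask[:, :, target_index] = 1.0

# target_mapping selects which positions are scored as prediction targets.
target_mapping = torch.zeros(1, 1, seq_len)
target_mapping[0, 0, target_index] = 1.0

with torch.no_grad():
    outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)

# One row of logits per target selected in target_mapping.
predicted_id = outputs.logits[0, 0].argmax(-1)
print(tokenizer.decode(predicted_id.tolist()))
```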
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_15.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3991ad5e6d601250e181452eddacdee6d2d0e694 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_15.txt @@ -0,0 +1 @@ +Since this is all done with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,…,sequence length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_16.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..796c111d3abf11c17f801acf34844871849e3a71 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_16.txt @@ -0,0 +1 @@ +XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_17.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..54d5b6f109caab33fe10917d8984f3eadc082dbe --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_17.txt @@ -0,0 +1,73 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Multiple choice task guide + +XLNetConfig +[[autodoc]] XLNetConfig +XLNetTokenizer +[[autodoc]] XLNetTokenizer + - build_inputs_with_special_tokens + - get_special_tokens_mask + - create_token_type_ids_from_sequences + - save_vocabulary +XLNetTokenizerFast +[[autodoc]] XLNetTokenizerFast +XLNet specific outputs +[[autodoc]] models.xlnet.modeling_xlnet.XLNetModelOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput +[[autodoc]] models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput +[[autodoc]] models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput + +XLNetModel +[[autodoc]] XLNetModel + - forward +XLNetLMHeadModel +[[autodoc]] XLNetLMHeadModel + - forward +XLNetForSequenceClassification +[[autodoc]] XLNetForSequenceClassification + - forward +XLNetForMultipleChoice +[[autodoc]] XLNetForMultipleChoice + - forward +XLNetForTokenClassification +[[autodoc]] XLNetForTokenClassification + - forward +XLNetForQuestionAnsweringSimple +[[autodoc]] XLNetForQuestionAnsweringSimple + - forward +XLNetForQuestionAnswering +[[autodoc]] XLNetForQuestionAnswering + - forward + +TFXLNetModel +[[autodoc]] TFXLNetModel + - call +TFXLNetLMHeadModel +[[autodoc]] TFXLNetLMHeadModel + - call +TFXLNetForSequenceClassification +[[autodoc]] 
TFXLNetForSequenceClassification + - call +TFXLNetForMultipleChoice +[[autodoc]] TFXLNetForMultipleChoice + - call +TFXLNetForTokenClassification +[[autodoc]] TFXLNetForTokenClassification + - call +TFXLNetForQuestionAnsweringSimple +[[autodoc]] TFXLNetForQuestionAnsweringSimple + - call \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..649897ffdb873807ea08b834f0650ff587d9718e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_7.txt @@ -0,0 +1 @@ +This model was contributed by thomwolf. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_8.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlnet/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlnet/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..91fc06ae923145e6ecb5babe31ba267954caee55 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlnet/chunk_9.txt @@ -0,0 +1,3 @@ +Usage tips + +The specific attention pattern can be controlled at training and test time using the perm_mask input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..679feb46839e836c763d78f393e112a0ca531097 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_11.txt @@ -0,0 +1,3 @@ +Usage tips + +XLS-R is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..813d5bf7751eeb586a9e3703e3e2b1f83d77a56a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_12.txt @@ -0,0 +1,2 @@ +The XLS-R model was trained using connectionist temporal classification (CTC), so the model output has to be decoded using + [Wav2Vec2CTCTokenizer]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cb7eff9e421cbe9148886c3ce1509b3d9c50ff9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_13.txt @@ -0,0 +1 @@ +XLS-R's architecture is based on the Wav2Vec2 model; refer to Wav2Vec2's documentation page for API reference.
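Given that the XLS-R chunks above state the model consumes a raw waveform and is decoded with CTC through the Wav2Vec2 classes, here is a minimal sketch of that pipeline. The checkpoint path is a hypothetical placeholder for an XLS-R model that has already been fine-tuned with a CTC head, and the 16 kHz sampling rate and silent dummy audio are assumptions for illustration only.

```python
import numpy as np
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor

# Hypothetical placeholder: any XLS-R checkpoint fine-tuned for CTC would work here.
checkpoint = "path/to/xls-r-finetuned-for-ctc"
processor = Wav2Vec2Processor.from_pretrained(checkpoint)  # bundles feature extractor + CTC tokenizer
model = Wav2Vec2ForCTC.from_pretrained(checkpoint)

# `waveform` stands in for a 1-D float array of raw audio sampled at 16 kHz.
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence as a stand-in
inputs = processor(waveform, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits  # (batch, time_frames, vocab_size)

# Greedy CTC decoding: argmax per frame, then collapse repeats and blanks in batch_decode.
predicted_ids = torch.argmax(logits, dim=-1)
transcription = processor.batch_decode(predicted_ids)
print(transcription)
```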
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_5.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..c409bcadaee727bbe90a47fb8fb43402dccbe0b8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_5.txt @@ -0,0 +1,2 @@ +For speech recognition, XLS-R improves over the best known prior work on BABEL, MLS, CommonVoice as well as +VoxPopuli, lowering error rates by 14-34% relative on average. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9b4d9aafd56c79ed0fcda5dad81d3257e349bb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_6.txt @@ -0,0 +1,2 @@ +XLS-R also sets a new state of the art on VoxLingua107 +language identification. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1043929f2b8dee6642386c8d76435d7f0a258f13 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_7.txt @@ -0,0 +1,3 @@ +Moreover, we show that with sufficient model size, cross-lingual pretraining can outperform +English-only pretraining when translating English speech into other languages, a setting which favors monolingual +pretraining. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..673f3ad7b406e8a5784799562b0c748429b08509 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_8.txt @@ -0,0 +1 @@ +We hope XLS-R can help to improve speech processing tasks for many more languages of the world. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xls_r/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xls_r/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..047a527667ddefdc01e7dbb2e72c0aeaab09a5f8 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xls_r/chunk_9.txt @@ -0,0 +1 @@ +Relevant checkpoints can be found under https://huggingface.co/models?other=xls_r. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9c5c9ec1440e528f21eeb376bea1885802899ea --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips + +XLSR-Wav2Vec2 is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b82da606287fde898e8cb2e129117c19b97504b1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_11.txt @@ -0,0 +1,2 @@ +XLSR-Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be + decoded using [Wav2Vec2CTCTokenizer]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2da005737639c86d715895d048264a934e423148 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_12.txt @@ -0,0 +1 @@ +XLSR-Wav2Vec2's architecture is based on the Wav2Vec2 model, so one can refer to Wav2Vec2's documentation page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_4.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..1578248d1178e7eb94e50b947cd703f50db5d1d9 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_4.txt @@ -0,0 +1,2 @@ +On the CommonVoice benchmark, XLSR shows a relative phoneme error rate reduction +of 72% compared to the best known results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_5.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..019dcdc1afc5f4361a70188ea11757b42ee115e2 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_5.txt @@ -0,0 +1,2 @@ +On BABEL, our approach improves word error rate by 16% relative compared to +a comparable system. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bd25042685c41d0de7272a24818d2d7ae6af0a1 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_6.txt @@ -0,0 +1,2 @@ +Our approach enables a single multilingual speech recognition model which is competitive to strong +individual models. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..526a25c3bd35460ce69bb6dab6dd14b6fbf0a1e7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_7.txt @@ -0,0 +1,2 @@ +Analysis shows that the latent discrete speech representations are shared across languages with +increased sharing for related languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d3f17734af6e381c0f52487869bf57ea196d604 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_8.txt @@ -0,0 +1,2 @@ +We hope to catalyze research in low-resource speech understanding by releasing +XLSR-53, a large model pretrained in 53 languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xlsr_wav2vec2/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_10.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ce56e9a2ea89891d21ca63eb4d59ac3e410763 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_10.txt @@ -0,0 +1,3 @@ +Usage tips +Tips: +- X-MOD is similar to XLM-R, but a difference is that the input language needs to be specified so that the correct language adapter can be activated. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_11.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b7e69cfe66e0aa002b5df61240ee0a88cc0b02f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_11.txt @@ -0,0 +1 @@ +- The main models – base and large – have adapters for 81 languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_12.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c39f3faf5a383668414f52880e087f7bfeebd3cf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_12.txt @@ -0,0 +1,4 @@ +Adapter Usage +Input language +There are two ways to specify the input language: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_13.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fdaeb9d7583d787a981f8bc0c7d2df746729f48 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_13.txt @@ -0,0 +1,26 @@ +By setting a default language before using the model: +thon +from transformers import XmodModel +model = XmodModel.from_pretrained("facebook/xmod-base") +model.set_default_language("en_XX") + +By explicitly passing the index of the language adapter for each sample: + +thon +import torch +input_ids = torch.tensor( + [ + [0, 581, 10269, 83, 99942, 136, 60742, 23, 70, 80583, 18276, 2], + [0, 1310, 49083, 443, 269, 71, 5486, 165, 60429, 660, 23, 2], + ] +) +lang_ids = torch.LongTensor( + [ + 0, # en_XX + 8, # de_DE + ] +) +output = model(input_ids, lang_ids=lang_ids) + +Fine-tuning +The paper recommends that the embedding layer and the language adapters are frozen during fine-tuning. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_14.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9922ca74d3cb0ff4a0de3ab27327f9df37706e07 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_14.txt @@ -0,0 +1,43 @@ +A method for doing this is provided: +thon +model.freeze_embeddings_and_language_adapters() +Fine-tune the model + +Cross-lingual transfer +After fine-tuning, zero-shot cross-lingual transfer can be tested by activating the language adapter of the target language: +thon +model.set_default_language("de_DE") +Evaluate the model on German examples + +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Causal language modeling task guide +Masked language modeling task guide +Multiple choice task guide + +XmodConfig +[[autodoc]] XmodConfig +XmodModel +[[autodoc]] XmodModel + - forward +XmodForCausalLM +[[autodoc]] XmodForCausalLM + - forward +XmodForMaskedLM +[[autodoc]] XmodForMaskedLM + - forward +XmodForSequenceClassification +[[autodoc]] XmodForSequenceClassification + - forward +XmodForMultipleChoice +[[autodoc]] XmodForMultipleChoice + - forward +XmodForTokenClassification +[[autodoc]] XmodForTokenClassification + - forward +XmodForQuestionAnswering +[[autodoc]] XmodForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_6.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..85cebd876895f51929105eaec56ee214283bba27 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_6.txt @@ -0,0 +1 @@ +Our experiments on natural language inference, named entity recognition and question answering show that our approach not only mitigates the negative interference between languages, but also enables positive transfer, resulting in improved monolingual and cross-lingual performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_7.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..5945344732025579e9b64912dbf6b73b1524e674 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_7.txt @@ -0,0 +1 @@ +Furthermore, our approach enables adding languages post-hoc with no measurable drop in performance, no longer limiting the model usage to the set of pre-trained languages. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_8.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e376f37735cc48bb9325bb89fa82680db87293da --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_8.txt @@ -0,0 +1 @@ +This model was contributed by jvamvas. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_xmod/chunk_9.txt b/chunked/content_aware_chunking/model_doc_xmod/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc90e425b7fc672f9ae8fe7daffdbcc8e07c7720 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_xmod/chunk_9.txt @@ -0,0 +1 @@ +The original code can be found here and the original documentation is found here. 
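The X-MOD fragments above (default language, per-sample lang_ids, freezing, cross-lingual transfer) can be combined into one runnable sketch. The facebook/xmod-base checkpoint, the en_XX/de_DE codes, and the set_default_language and freeze_embeddings_and_language_adapters helpers come from the chunks themselves; using AutoTokenizer, the num_labels choice, the dummy label, and calling those helpers on XmodForSequenceClassification rather than XmodModel are assumptions added for illustration.

```python
import torch
from transformers import AutoTokenizer, XmodForSequenceClassification

# Checkpoint and language codes are taken from the chunks above.
model = XmodForSequenceClassification.from_pretrained("facebook/xmod-base", num_labels=2)
tokenizer = AutoTokenizer.from_pretrained("facebook/xmod-base")  # assumption: XLM-R style tokenizer shipped with the repo

# 1. Activate the English adapter and freeze what the paper recommends freezing.
model.set_default_language("en_XX")
model.freeze_embeddings_and_language_adapters()

# 2. Fine-tune on English data (one illustrative step with a dummy label).
inputs = tokenizer("This movie was great!", return_tensors="pt")
labels = torch.tensor([1])  # hypothetical positive label
loss = model(**inputs, labels=labels).loss
loss.backward()

# 3. Zero-shot cross-lingual transfer: switch to the German adapter at evaluation time.
model.set_default_language("de_DE")
german_inputs = tokenizer("Dieser Film war großartig!", return_tensors="pt")
with torch.no_grad():
    prediction = model(**german_inputs).logits.argmax(-1)
print(prediction)
```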
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_10.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_10.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_11.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ca7b93ad7d5c2f8814c9b9629ea9922f3e57fb7 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_11.txt @@ -0,0 +1,2 @@ +Resources +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_12.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..47371e54e3551074ff5048e43fea5d7b7ff7c54b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_12.txt @@ -0,0 +1 @@ +All example notebooks illustrating inference + fine-tuning [YolosForObjectDetection] on a custom dataset can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_13.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..648cbfec7e58238ec9c7eea0f74605f31b779585 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_13.txt @@ -0,0 +1,3 @@ +See also: Object detection task guide + +If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_14.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e528f03775eac0139700091fe31f6016df9265f --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_14.txt @@ -0,0 +1 @@ +The resource should ideally demonstrate something new instead of duplicating an existing resource. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_15.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0888f1e9e31737c921c61f7bd240dfb1ed8854f3 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_15.txt @@ -0,0 +1 @@ +Use [YolosImageProcessor] for preparing images (and optional targets) for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_16.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..52e4132f8e6ffd966ff0b9c700d98e3c63bcb22e --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_16.txt @@ -0,0 +1 @@ +Contrary to DETR, YOLOS doesn't require a pixel_mask to be created. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_17.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..32df62e4ed5b7a9e91c5641be54e899de81d6845 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_17.txt @@ -0,0 +1,18 @@ +YolosConfig +[[autodoc]] YolosConfig +YolosImageProcessor +[[autodoc]] YolosImageProcessor + - preprocess + - pad + - post_process_object_detection +YolosFeatureExtractor +[[autodoc]] YolosFeatureExtractor + - call + - pad + - post_process_object_detection +YolosModel +[[autodoc]] YolosModel + - forward +YolosForObjectDetection +[[autodoc]] YolosForObjectDetection + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_6.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..31919d550e8095be07cd421c492e18fadac9027d --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_6.txt @@ -0,0 +1 @@ +We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_7.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba5b7806482a8c1b154fee88b958b1dd61af3ae --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_7.txt @@ -0,0 +1 @@ +YOLOS architecture. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_8.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_8.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yolos/chunk_9.txt b/chunked/content_aware_chunking/model_doc_yolos/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e7a37c32483565713220f7f08f1ccf08011144a --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yolos/chunk_9.txt @@ -0,0 +1 @@ +This model was contributed by nielsr. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_10.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..93b83c9cce9970d66fafca2daadc568b624eb223 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_10.txt @@ -0,0 +1,3 @@ +On the Long Range Arena (LRA) benchmark, +for evaluating performance on long sequences, our method achieves results consistent with softmax self-attention but with sizable +speed-ups and memory savings and often outperforms other efficient self-attention methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_11.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8bcfdb9cbb19ad823abc348f6587cca536de211 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_11.txt @@ -0,0 +1,2 @@ +Our code is available at this https URL +This model was contributed by novice03. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_12.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..05df901cedd25bf0e7baa83f600faf04901b2a66 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_12.txt @@ -0,0 +1 @@ +The original code can be found here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_13.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0de3cd34e6ae9ab667a3108db09fd987a1074f4b --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_13.txt @@ -0,0 +1,4 @@ +Usage tips + +The YOSO attention algorithm is implemented through custom CUDA kernels, functions written in CUDA C++ that can be executed multiple times +in parallel on a GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_14.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..425a96062c9c977c21f486445ce04282307083eb --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_14.txt @@ -0,0 +1 @@ +The kernels provide a fast_hash function, which approximates the random projections of the queries and keys using the Fast Hadamard Transform. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_15.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3986d0fecf3d03fec7f477efbbac02fe15a95e36 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_15.txt @@ -0,0 +1,2 @@ +Using these +hash codes, the lsh_cumulation function approximates self-attention via LSH-based Bernoulli sampling. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_16.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3571da0f1126bdd0cb307827633dec682f5c8edf --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_16.txt @@ -0,0 +1 @@ +To use the custom kernels, the user should set config.use_expectation = False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_17.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f866740f6ca68a3eaba178e221cceae0f5cbc8d4 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_17.txt @@ -0,0 +1,2 @@ +To ensure that the kernels are compiled successfully, +the user must install the correct version of PyTorch and cudatoolkit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_18.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fe8b42bc24fbefa0ead5400e608ecfbc2792fbc --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_18.txt @@ -0,0 +1,2 @@ +By default, config.use_expectation = True, which uses YOSO-E and +does not require compiling CUDA kernels. 
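The use_expectation switch described above is a plain configuration flag. A minimal sketch of both paths follows, assuming the uw-madison/yoso-4096 checkpoint for illustration.

from transformers import YosoConfig, YosoModel

# Default path: use_expectation=True selects YOSO-E and needs no CUDA kernel compilation.
model = YosoModel.from_pretrained("uw-madison/yoso-4096")
print(model.config.use_expectation)  # True

# Kernel path: requires a PyTorch/cudatoolkit install that can compile the custom kernels.
config = YosoConfig.from_pretrained("uw-madison/yoso-4096", use_expectation=False)
model = YosoModel.from_pretrained("uw-madison/yoso-4096", config=config)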
\ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_19.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2da172973647a7008a45eb959acb5aeea254d15 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_19.txt @@ -0,0 +1 @@ +YOSO Attention Algorithm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_20.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6888f0b506f8c3d8b61d30a837695454b5e26ff --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_20.txt @@ -0,0 +1 @@ +Taken from the original paper. \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_21.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..6507eebfd9b6edc69f0ee7ea5a5cd908a2a040ee --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_21.txt @@ -0,0 +1,28 @@ +Resources + +Text classification task guide +Token classification task guide +Question answering task guide +Masked language modeling task guide +Multiple choice task guide + +YosoConfig +[[autodoc]] YosoConfig +YosoModel +[[autodoc]] YosoModel + - forward +YosoForMaskedLM +[[autodoc]] YosoForMaskedLM + - forward +YosoForSequenceClassification +[[autodoc]] YosoForSequenceClassification + - forward +YosoForMultipleChoice +[[autodoc]] YosoForMultipleChoice + - forward +YosoForTokenClassification +[[autodoc]] YosoForTokenClassification + - forward +YosoForQuestionAnswering +[[autodoc]] YosoForQuestionAnswering + - forward \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_8.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d903f33ac82326f4b33d7765c9b77cf2d5d73ca --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_8.txt @@ -0,0 +1,2 @@ +This leads to an efficient sampling scheme to estimate self-attention which relies on specific modifications of +LSH (to enable deployment on GPU architectures). \ No newline at end of file diff --git a/chunked/content_aware_chunking/model_doc_yoso/chunk_9.txt b/chunked/content_aware_chunking/model_doc_yoso/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..69c2c1db5fb30a92c7f373bba07192fecd024597 --- /dev/null +++ b/chunked/content_aware_chunking/model_doc_yoso/chunk_9.txt @@ -0,0 +1,2 @@ +We evaluate our algorithm on the GLUE benchmark with standard 512 sequence +length where we see favorable performance relative to a standard pretrained Transformer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_15.txt b/chunked/content_aware_chunking/tasks_asr/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb54a12b788c6744523d6aed7df8543dd99480f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_15.txt @@ -0,0 +1 @@ +You can quickly load a evaluation method with the 🤗 Evaluate library. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_16.txt b/chunked/content_aware_chunking/tasks_asr/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8521fe192f77f083e29ad6bdb4e6de78537a3f57 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_16.txt @@ -0,0 +1,19 @@ +For this task, load the word error rate (WER) metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +wer = evaluate.load("wer") + +Then create a function that passes your predictions and labels to [~evaluate.EvaluationModule.compute] to calculate the WER: + +import numpy as np +def compute_metrics(pred): + pred_logits = pred.predictions + pred_ids = np.argmax(pred_logits, axis=-1) + + pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id + pred_str = processor.batch_decode(pred_ids) + label_str = processor.batch_decode(pred.label_ids, group_tokens=False) + wer = wer.compute(predictions=pred_str, references=label_str) + return {"wer": wer} + +Your compute_metrics function is ready to go now, and you'll return to it when you setup your training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_17.txt b/chunked/content_aware_chunking/tasks_asr/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f493faa6fd8490ca5e6198c99a80a5e6e236cf1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_17.txt @@ -0,0 +1,3 @@ +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_18.txt b/chunked/content_aware_chunking/tasks_asr/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_18.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_19.txt b/chunked/content_aware_chunking/tasks_asr/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..172270d4eba22fcf06814be08c43870d7d7883e4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_19.txt @@ -0,0 +1 @@ +Load Wav2Vec2 with [AutoModelForCTC]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_20.txt b/chunked/content_aware_chunking/tasks_asr/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..e12f3bcb6007b7a5b2c9dfa52f7f34643be80d36 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_20.txt @@ -0,0 +1 @@ +Specify the reduction to apply with the ctc_loss_reduction parameter. 
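One caveat about the compute_metrics function shown earlier in this ASR guide: it reassigns the loaded wer metric to a local variable of the same name, so Python treats wer as an unbound local inside the function and the call would raise UnboundLocalError during evaluation. A minimal variant that keeps the metric object intact is sketched below; it assumes the processor created earlier in the guide is in scope.

import numpy as np
import evaluate

wer = evaluate.load("wer")

def compute_metrics(pred):
    pred_logits = pred.predictions
    pred_ids = np.argmax(pred_logits, axis=-1)

    # Replace the padded label ids before decoding, as in the original function.
    pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
    pred_str = processor.batch_decode(pred_ids)
    label_str = processor.batch_decode(pred.label_ids, group_tokens=False)

    # Store the score under a different name so the metric object is not shadowed.
    wer_score = wer.compute(predictions=pred_str, references=label_str)
    return {"wer": wer_score}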
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_21.txt b/chunked/content_aware_chunking/tasks_asr/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..9355f5505d50098564de944c324ba11be49d2571 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_21.txt @@ -0,0 +1,12 @@ +It is often better to use the average instead of the default summation: + +from transformers import AutoModelForCTC, TrainingArguments, Trainer +model = AutoModelForCTC.from_pretrained( + "facebook/wav2vec2-base", + ctc_loss_reduction="mean", + pad_token_id=processor.tokenizer.pad_token_id, + ) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_22.txt b/chunked/content_aware_chunking/tasks_asr/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_22.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_23.txt b/chunked/content_aware_chunking/tasks_asr/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_23.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_24.txt b/chunked/content_aware_chunking/tasks_asr/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..93791e8d4f336ae0db24e78b75661d1a82e8f5fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_24.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the WER and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_25.txt b/chunked/content_aware_chunking/tasks_asr/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_25.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_26.txt b/chunked/content_aware_chunking/tasks_asr/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_26.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_27.txt b/chunked/content_aware_chunking/tasks_asr/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb80a14a3e8a16a043146daaa89f36cb46e64080 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_27.txt @@ -0,0 +1,36 @@ +training_args = TrainingArguments( + output_dir="my_awesome_asr_mind_model", + per_device_train_batch_size=8, + gradient_accumulation_steps=2, + learning_rate=1e-5, + warmup_steps=500, + max_steps=2000, + gradient_checkpointing=True, + fp16=True, + group_by_length=True, + evaluation_strategy="steps", + per_device_eval_batch_size=8, + save_steps=1000, + eval_steps=1000, + logging_steps=25, + load_best_model_at_end=True, + metric_for_best_model="wer", + greater_is_better=False, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=encoded_minds["train"], + eval_dataset=encoded_minds["test"], + tokenizer=processor, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +For a more in-depth example of how to finetune a model for automatic speech recognition, take a look at this blog post for English ASR and this post for multilingual ASR. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_28.txt b/chunked/content_aware_chunking/tasks_asr/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_28.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_29.txt b/chunked/content_aware_chunking/tasks_asr/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..57db11b568d26ed043fa4f294784070dc46211e8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_29.txt @@ -0,0 +1 @@ +Load an audio file you'd like to run inference on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_30.txt b/chunked/content_aware_chunking/tasks_asr/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d316011b9c67ec2d713d9f7497eddfb28fb24ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_30.txt @@ -0,0 +1 @@ +Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_31.txt b/chunked/content_aware_chunking/tasks_asr/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d117e0b9db7ff13896aa7139f4b315bf3efed872 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_31.txt @@ -0,0 +1,7 @@ +from datasets import load_dataset, Audio +dataset = load_dataset("PolyAI/minds14", "en-US", split="train") +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +sampling_rate = dataset.features["audio"].sampling_rate +audio_file = dataset[0]["audio"]["path"] + +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_32.txt b/chunked/content_aware_chunking/tasks_asr/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6b99382bef84a008bb68f3d1c704c656308dacb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_32.txt @@ -0,0 +1,8 @@ +Instantiate a pipeline for automatic speech recognition with your model, and pass your audio file to it: + +from transformers import pipeline +transcriber = pipeline("automatic-speech-recognition", model="stevhliu/my_awesome_asr_minds_model") +transcriber(audio_file) +{'text': 'I WOUD LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'} + +The transcription is decent, but it could be better! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_33.txt b/chunked/content_aware_chunking/tasks_asr/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fd6ea24e36e470a46be2c49a828bb4692cd004c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_33.txt @@ -0,0 +1 @@ +Try finetuning your model on more examples to get even better results! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_asr/chunk_34.txt b/chunked/content_aware_chunking/tasks_asr/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb048cf566c6c968fb873043ef47ef991d4c1060 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_asr/chunk_34.txt @@ -0,0 +1,22 @@ +You can also manually replicate the results of the pipeline if you'd like: + +Load a processor to preprocess the audio file and transcription and return the input as PyTorch tensors: + +from transformers import AutoProcessor +processor = AutoProcessor.from_pretrained("stevhliu/my_awesome_asr_mind_model") +inputs = processor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +from transformers import AutoModelForCTC +model = AutoModelForCTC.from_pretrained("stevhliu/my_awesome_asr_mind_model") +with torch.no_grad(): + logits = model(**inputs).logits + +Get the predicted input_ids with the highest probability, and use the processor to decode the predicted input_ids back into text: + +import torch +predicted_ids = torch.argmax(logits, dim=-1) +transcription = processor.batch_decode(predicted_ids) +transcription +['I WOUL LIKE O SET UP JOINT ACOUNT WTH Y PARTNER'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_14.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7af2802b13b5b374b45890f0efd0257cd8a8dc4d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_14.txt @@ -0,0 +1,13 @@ +For this task, load the accuracy metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +accuracy = evaluate.load("accuracy") + +Then create a function that passes your predictions and labels to [~evaluate.EvaluationModule.compute] to calculate the accuracy: + +import numpy as np +def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=1) + return accuracy.compute(predictions=predictions, references=eval_pred.label_ids) + +Your compute_metrics function is ready to go now, and you'll return to it when you setup your training. 
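Before wiring the accuracy-based compute_metrics above into training, it can be sanity-checked with a dummy EvalPrediction. This check is purely illustrative, not part of the original guide, and assumes the accuracy metric and compute_metrics defined above are in scope.

import numpy as np
from transformers import EvalPrediction

dummy_logits = np.array([[0.1, 0.9], [0.8, 0.2]])  # two samples, two classes
dummy_labels = np.array([1, 0])                    # both predictions are correct
print(compute_metrics(EvalPrediction(predictions=dummy_logits, label_ids=dummy_labels)))
# {'accuracy': 1.0}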
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_15.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f493faa6fd8490ca5e6198c99a80a5e6e236cf1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_15.txt @@ -0,0 +1,3 @@ +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_16.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_16.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_17.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8ff05a1cf23273213d151484063014e5d6cb96b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_17.txt @@ -0,0 +1,11 @@ +Load Wav2Vec2 with [AutoModelForAudioClassification] along with the number of expected labels, and the label mappings: + +from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer +num_labels = len(id2label) +model = AutoModelForAudioClassification.from_pretrained( + "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label + ) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_18.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_18.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_19.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_19.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_20.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c977d29a14cc4699a4b06a477bb52c2490bdc8e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_20.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the accuracy and save the training checkpoint. 
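Signing in beforehand can be done from a notebook as sketched below; from a terminal, huggingface-cli login achieves the same. This step is implied by push_to_hub=True but not shown in the snippet above.

from huggingface_hub import notebook_login

notebook_login()  # prompts for a Hugging Face access token with write permission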
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_21.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_21.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_22.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_22.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_23.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..242a78bbedcc4ce673819427d36975831ea2280f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_23.txt @@ -0,0 +1,30 @@ +training_args = TrainingArguments( + output_dir="my_awesome_mind_model", + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=3e-5, + per_device_train_batch_size=32, + gradient_accumulation_steps=4, + per_device_eval_batch_size=32, + num_train_epochs=10, + warmup_ratio=0.1, + logging_steps=10, + load_best_model_at_end=True, + metric_for_best_model="accuracy", + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=encoded_minds["train"], + eval_dataset=encoded_minds["test"], + tokenizer=feature_extractor, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +For a more in-depth example of how to finetune a model for audio classification, take a look at the corresponding PyTorch notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_24.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_24.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_25.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..57db11b568d26ed043fa4f294784070dc46211e8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_25.txt @@ -0,0 +1 @@ +Load an audio file you'd like to run inference on. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_26.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d316011b9c67ec2d713d9f7497eddfb28fb24ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_26.txt @@ -0,0 +1 @@ +Remember to resample the sampling rate of the audio file to match the sampling rate of the model if you need to! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..97176846e4a9350f5153e9487b4def8096b050e5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_27.txt @@ -0,0 +1,7 @@ +from datasets import load_dataset, Audio +dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") +dataset = dataset.cast_column("audio", Audio(sampling_rate=16000)) +sampling_rate = dataset.features["audio"].sampling_rate +audio_file = dataset[0]["audio"]["path"] + +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_audio_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_audio_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1b43e058c72af966cef100000be94778c1e478c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_audio_classification/chunk_28.txt @@ -0,0 +1,35 @@ +Instantiate a pipeline for audio classification with your model, and pass your audio file to it: + +from transformers import pipeline +classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model") +classifier(audio_file) +[ + {'score': 0.09766869246959686, 'label': 'cash_deposit'}, + {'score': 0.07998877018690109, 'label': 'app_error'}, + {'score': 0.0781070664525032, 'label': 'joint_account'}, + {'score': 0.07667109370231628, 'label': 'pay_bill'}, + {'score': 0.0755252093076706, 'label': 'balance'} +] + +You can also manually replicate the results of the pipeline if you'd like: + +Load a feature extractor to preprocess the audio file and return the input as PyTorch tensors: + +from transformers import AutoFeatureExtractor +feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model") +inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +from transformers import AutoModelForAudioClassification +model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model") +with torch.no_grad(): + logits = model(**inputs).logits + +Get the class with the highest probability, and use the model's id2label mapping to convert it to a label: + +import torch +predicted_class_ids = torch.argmax(logits).item() +predicted_label = model.config.id2label[predicted_class_ids] +predicted_label +'cash_deposit' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_100.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..5608f0e93336431485ef9ded2fe660a0946c862f --- /dev/null +++ 
b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_100.txt @@ -0,0 +1 @@ +Forward the result or preprocessing through the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_101.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_101.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_102.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea06c34e361591bc31e7cc0783970bd1a7f7d757 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_102.txt @@ -0,0 +1,2 @@ +The model returns start_logits and end_logits, which indicate which token is at the start of the answer and +which token is at the end of the answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_103.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..db30827d28b8e239d3de46a816ab14859cea46a1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_103.txt @@ -0,0 +1 @@ +Both have shape (batch_size, sequence_length). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_104.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_104.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_105.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..e12a919dc960e022f7a42bdd707955056ae9e908 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_105.txt @@ -0,0 +1 @@ +Take an argmax on the last dimension of both the start_logits and end_logits to get the predicted start_idx and end_idx. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_106.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_106.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_107.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..38824e4d2737c2a627850702671a6ad0a1369b91 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_107.txt @@ -0,0 +1 @@ +Decode the answer with the tokenizer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_108.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..775c3ac31210e0add3a732d58796d92f5d3ecad4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_108.txt @@ -0,0 +1,14 @@ +import torch +from transformers import AutoProcessor +from transformers import AutoModelForDocumentQuestionAnswering +processor = AutoProcessor.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +model = AutoModelForDocumentQuestionAnswering.from_pretrained("MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +with torch.no_grad(): + encoding = processor(image.convert("RGB"), question, return_tensors="pt") + outputs = model(**encoding) + start_logits = outputs.start_logits + end_logits = outputs.end_logits + predicted_start_idx = start_logits.argmax(-1).item() + predicted_end_idx = end_logits.argmax(-1).item() +processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]) +'lee a. waller' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_23.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2db969fb7f3d50704e885c3e17569b17b2be170e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_23.txt @@ -0,0 +1,2 @@ +Instead, we can use the [LayoutLMv2Processor] on the original data for both OCR and +tokenization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_24.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..16c83292035a969f98a6b502991a4bace793ab18 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_24.txt @@ -0,0 +1 @@ +This way we'll get the inputs that match model's expected input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_25.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..945fbb3d365dc05c3ef69d64888527a67a3e7722 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_25.txt @@ -0,0 +1,2 @@ +If you want to process images manually, +check out the LayoutLMv2 model documentation to learn what input format the model expects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_26.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9818ba105027787a1d2c11a3191c4f19eaadf25 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_26.txt @@ -0,0 +1,4 @@ +updated_dataset = updated_dataset.remove_columns("words") +updated_dataset = updated_dataset.remove_columns("bounding_boxes") + +Finally, the data exploration won't be complete if we don't peek at an image example. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_27.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1088bdab44c752044e9100d2ea55e7a0c56ed9cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_27.txt @@ -0,0 +1,5 @@ +updated_dataset["train"][11]["image"] + +Preprocess the data +The Document Question Answering task is a multimodal task, and you need to make sure that the inputs from each modality +are preprocessed according to the model's expectations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_28.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcc2f7b5c48e069a916f5898f65cc70c34e01f35 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_28.txt @@ -0,0 +1 @@ +Let's start by loading the [LayoutLMv2Processor], which internally combines an image processor that can handle image data and a tokenizer that can encode text data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_29.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..86857501b16562977c3ed2e2735ade51fd1d9cf8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_29.txt @@ -0,0 +1,5 @@ +from transformers import AutoProcessor +processor = AutoProcessor.from_pretrained(model_checkpoint) + +Preprocessing document images +First, let's prepare the document images for the model with the help of the image_processor from the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_30.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7efcfe7902b68c9b2522d143dae1725e35d3d7be --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_30.txt @@ -0,0 +1,2 @@ +By default, image processor resizes the images to 224x224, makes sure they have the correct order of color channels, +applies OCR with tesseract to get words and normalized bounding boxes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_31.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bf0eacd023cbb41364c280dfcad1e0d13d18f5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_31.txt @@ -0,0 +1 @@ +In this tutorial, all of these defaults are exactly what we need. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_32.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..63be5218fe33871ccaae5e0e6719514f6eef6eba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_32.txt @@ -0,0 +1 @@ +Write a function that applies the default image processing to a batch of images and returns the results of OCR. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_33.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..36246295ec14647220b7c14c11772cabf512f73c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_33.txt @@ -0,0 +1,11 @@ +image_processor = processor.image_processor +def get_ocr_words_and_boxes(examples): + images = [image.convert("RGB") for image in examples["image"]] + encoded_inputs = image_processor(images) + + examples["image"] = encoded_inputs.pixel_values + examples["words"] = encoded_inputs.words + examples["boxes"] = encoded_inputs.boxes + return examples + +To apply this preprocessing to the entire dataset in a fast way, use [~datasets.Dataset.map]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_34.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9ae602e9fdf7e0208bd5435e056556458a56f10 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_34.txt @@ -0,0 +1,4 @@ +dataset_with_ocr = updated_dataset.map(get_ocr_words_and_boxes, batched=True, batch_size=2) + +Preprocessing text data +Once we have applied OCR to the images, we need to encode the text part of the dataset to prepare it for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_35.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa7cfa5ee6a3eae0d52f37475c524be80c899f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_35.txt @@ -0,0 +1,2 @@ +This involves converting the words and boxes that we got in the previous step to token-level input_ids, attention_mask, +token_type_ids and bbox. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_36.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..e24b2398e905d111342378097d0805da21e6756d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_36.txt @@ -0,0 +1 @@ +For preprocessing text, we'll need the tokenizer from the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_37.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc7374f98e33f237e29cfb2c9047be2452eb6ff8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_37.txt @@ -0,0 +1,3 @@ +tokenizer = processor.tokenizer + +On top of the preprocessing mentioned above, we also need to add the labels for the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_38.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae963fdbef23ce78381861e9afbdb2ffe9409ec5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_38.txt @@ -0,0 +1,3 @@ +For xxxForQuestionAnswering models +in 🤗 Transformers, the labels consist of the start_positions and end_positions, indicating which token is at the +start and which token is at the end of the answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_39.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..18e82d7daf3a14743ff4ccb8b1b2c73d67f2fa16 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_39.txt @@ -0,0 +1 @@ +Let's start with that. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_40.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..993676cb6ecff2b37b83935c2e2f10f892a4b69d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_40.txt @@ -0,0 +1 @@ +Define a helper function that can find a sublist (the answer split into words) in a larger list (the words list). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_41.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..14e0e9f8c71b4e10f538ec40f1128eed4185bcea --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_41.txt @@ -0,0 +1 @@ +This function will take two lists as input, words_list and answer_list. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_42.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..801c5621322241d07860473cc08913967f5f2f29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_42.txt @@ -0,0 +1,3 @@ +It will then iterate over the words_list and check +if the current word in the words_list (words_list[i]) is equal to the first word of answer_list (answer_list[0]) and if +the sublist of words_list starting from the current word and of the same length as answer_list is equal to answer_list. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_43.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..08da9987debbc24697b38883b22dd3328614a14e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_43.txt @@ -0,0 +1,2 @@ +If this condition is true, it means that a match has been found, and the function will record the match, its starting index (idx), +and its ending index (idx + len(answer_list) - 1). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_44.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ab80cad2d1dfd6ede5fb43e63b05a63d05a2f1e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_44.txt @@ -0,0 +1 @@ +If more than one match was found, the function will return only the first one. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_45.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..e55f3a3b933f0ccd04edfd5c64badc64acf1f624 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_45.txt @@ -0,0 +1 @@ +If no match is found, the function returns (None, 0, and 0). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_46.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..641ce1ee174c1bc0f14c5cd95307079225a27f28 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_46.txt @@ -0,0 +1,25 @@ +def subfinder(words_list, answer_list): + matches = [] + start_indices = [] + end_indices = [] + for idx, i in enumerate(range(len(words_list))): + if words_list[i] == answer_list[0] and words_list[i : i + len(answer_list)] == answer_list: + matches.append(answer_list) + start_indices.append(idx) + end_indices.append(idx + len(answer_list) - 1) + if matches: + return matches[0], start_indices[0], end_indices[0] + else: + return None, 0, 0 + +To illustrate how this function finds the position of the answer, let's use it on an example: + +example = dataset_with_ocr["train"][1] +words = [word.lower() for word in example["words"]] +match, word_idx_start, word_idx_end = subfinder(words, example["answer"].lower().split()) +print("Question: ", example["question"]) +print("Words:", words) +print("Answer: ", example["answer"]) +print("start_index", word_idx_start) +print("end_index", word_idx_end) +Question: Who is in cc in this letter? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_47.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..b04d1cce6a7e265e12d6cc150f873778358b7914 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_47.txt @@ -0,0 +1 @@ +Words: ['wie', 'baw', 'brown', '&', 'williamson', 'tobacco', 'corporation', 'research', '&', 'development', 'internal', 'correspondence', 'to:', 'r. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_48.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e9b967c9b5bc82ba5c7c9d303d11780b653c23e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_48.txt @@ -0,0 +1 @@ +', 'h. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_49.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ce85ef334b693c5c02005c053951e448838dcae --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_49.txt @@ -0,0 +1 @@ +', 'honeycutt', 'ce:', 't.f. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_50.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..e84ce2923d039a286ded29580c2d71d47fcdcc5b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_50.txt @@ -0,0 +1 @@ +', 'riehl', 'from:', '. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_51.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..5da15cb723d7cc573d914124e0609cf677e5b150 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_51.txt @@ -0,0 +1 @@ +', 'c.j. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_52.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dfd865c82572d0ebab8d1bd4635f812931a33e0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_52.txt @@ -0,0 +1 @@ +', 'cook', 'date:', 'may', '8,', '1995', 'subject:', 'review', 'of', 'existing', 'brainstorming', 'ideas/483', 'the', 'major', 'function', 'of', 'the', 'product', 'innovation', 'graup', 'is', 'to', 'develop', 'marketable', 'nove! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_53.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffa854b0771843bd8225a7ecd2ba517bbbe9f8ca --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_53.txt @@ -0,0 +1 @@ +', 'products', 'that', 'would', 'be', 'profitable', 'to', 'manufacture', 'and', 'sell. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_54.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..8eabd46de017aa6d5b291a31b806dcd62e95b2c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_54.txt @@ -0,0 +1 @@ +', 'novel', 'is', 'defined', 'as:', 'of', 'a', 'new', 'kind,', 'or', 'different', 'from', 'anything', 'seen', 'or', 'known', 'before. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_55.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb44d15e60ea1ae582693cd6037fe6d27b4b14ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_55.txt @@ -0,0 +1 @@ +', 'innovation', 'is', 'defined', 'as:', 'something', 'new', 'or', 'different', 'introduced;', 'act', 'of', 'innovating;', 'introduction', 'of', 'new', 'things', 'or', 'methods. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_56.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbf1554713522f676cd2888f0ad2ef2cb0f0d8c0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_56.txt @@ -0,0 +1 @@ +', 'the', 'products', 'may', 'incorporate', 'the', 'latest', 'technologies,', 'materials', 'and', 'know-how', 'available', 'to', 'give', 'then', 'a', 'unique', 'taste', 'or', 'look. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_57.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..56f713fcdac10eb8e512b624027d7294a6bfec7f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_57.txt @@ -0,0 +1 @@ +', 'the', 'first', 'task', 'of', 'the', 'product', 'innovation', 'group', 'was', 'to', 'assemble,', 'review', 'and', 'categorize', 'a', 'list', 'of', 'existing', 'brainstorming', 'ideas. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_58.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..62f5207df522346ecbdecd55b5f6d9df07e6fdd7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_58.txt @@ -0,0 +1 @@ +', 'ideas', 'were', 'grouped', 'into', 'two', 'major', 'categories', 'labeled', 'appearance', 'and', 'taste/aroma. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_59.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..20fe93def0bd7e78b6134844fad1e8d968886d42 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_59.txt @@ -0,0 +1 @@ +', 'these', 'categories', 'are', 'used', 'for', 'novel', 'products', 'that', 'may', 'differ', 'from', 'a', 'visual', 'and/or', 'taste/aroma', 'point', 'of', 'view', 'compared', 'to', 'canventional', 'cigarettes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_60.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..e92220bcefd85b33fd5504747ff4e73aed56e9a7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_60.txt @@ -0,0 +1 @@ +', 'other', 'categories', 'include', 'a', 'combination', 'of', 'the', 'above,', 'filters,', 'packaging', 'and', 'brand', 'extensions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_61.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..e990ff66407cf1c0c68136e3fed0daf2000817ea --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_61.txt @@ -0,0 +1 @@ +', 'appearance', 'this', 'category', 'is', 'used', 'for', 'novel', 'cigarette', 'constructions', 'that', 'yield', 'visually', 'different', 'products', 'with', 'minimal', 'changes', 'in', 'smoke', 'chemistry', 'two', 'cigarettes', 'in', 'cne. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_62.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..263372e64b8f6ef487c893de5dbf3668e4e0ea9a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_62.txt @@ -0,0 +1 @@ +', 'emulti-plug', 'te', 'build', 'yaur', 'awn', 'cigarette. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_63.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..50a0cbf47d85a04b67cc83ae90ed2429ca3ab025 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_63.txt @@ -0,0 +1 @@ +', 'eswitchable', 'menthol', 'or', 'non', 'menthol', 'cigarette. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_64.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..31d4988c24a96e3b1a45e59d5cc260f8fe4555dd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_64.txt @@ -0,0 +1 @@ +', 'cigarettes', 'with', 'interspaced', 'perforations', 'to', 'enable', 'smoker', 'to', 'separate', 'unburned', 'section', 'for', 'future', 'smoking. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_65.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..85fa7c26b353a9bf79b0fb5af29aff999c3c81c2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_65.txt @@ -0,0 +1 @@ +', '«short', 'cigarette,', 'tobacco', 'section', '30', 'mm. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_66.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e119b2b12037d3a0a92fdb3bc7b644aee69ccbe --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_66.txt @@ -0,0 +1 @@ +', '«extremely', 'fast', 'buming', 'cigarette. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_67.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..c010b86ebd31948b94f8d164e760413dd4d5b915 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_67.txt @@ -0,0 +1 @@ +', '«novel', 'cigarette', 'constructions', 'that', 'permit', 'a', 'significant', 'reduction', 'iretobacco', 'weight', 'while', 'maintaining', 'smoking', 'mechanics', 'and', 'visual', 'characteristics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_68.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..556136b8862266864d4e50476e343857b4120db9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_68.txt @@ -0,0 +1 @@ +', 'higher', 'basis', 'weight', 'paper:', 'potential', 'reduction', 'in', 'tobacco', 'weight. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_69.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..2354c9a60a90bd89fa21cf080bfbccac3f6c364c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_69.txt @@ -0,0 +1 @@ +', '«more', 'rigid', 'tobacco', 'column;', 'stiffing', 'agent', 'for', 'tobacco;', 'e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_70.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3ce30b4bcc19432385874b9495c94f02f992aa8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_70.txt @@ -0,0 +1 @@ +', 'starch', 'colored', 'tow', 'and', 'cigarette', 'papers;', 'seasonal', 'promotions,', 'e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_71.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b6da648a9d0c86560f78ac8dfa344974f7c2208 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_71.txt @@ -0,0 +1 @@ +', 'pastel', 'colored', 'cigarettes', 'for', 'easter', 'or', 'in', 'an', 'ebony', 'and', 'ivory', 'brand', 'containing', 'a', 'mixture', 'of', 'all', 'black', '(black', 'paper', 'and', 'tow)', 'and', 'ail', 'white', 'cigarettes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_72.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb262019e14e7f543f62e7771aac98cc546de0d8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_72.txt @@ -0,0 +1,2 @@ +', '499150498'] +Answer: T.F. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_73.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce767ed3fadef3ee9789c0a9d4efdd8492e42272 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_73.txt @@ -0,0 +1,9 @@ +Riehl +start_index 17 +end_index 18 + +Once examples are encoded, however, they will look like this: + +encoding = tokenizer(example["question"], example["words"], example["boxes"]) +tokenizer.decode(encoding["input_ids"]) +[CLS] who is in cc in this letter? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_74.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ccc9218170c6081f70cec15e3b709f0075ff533 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_74.txt @@ -0,0 +1,3 @@ +[SEP] wie baw brown & williamson tobacco corporation research & development + +We'll need to find the position of the answer in the encoded input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_75.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..77bbbbb307b3739496d8a90843366c52f523a180 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_75.txt @@ -0,0 +1 @@ +* token_type_ids tells us which tokens are part of the question, and which ones are part of the document's words. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_76.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..55f3037976de675d616273c8cd04399d3f2a96a0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_76.txt @@ -0,0 +1 @@ +* tokenizer.cls_token_id will help find the special token at the beginning of the input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_77.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..80169cf3535c62102f6a5a2c4afeebb92e3ed9a1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_77.txt @@ -0,0 +1,2 @@ +* word_ids will help match the answer found in the original words to the same answer in the full encoded input and determine +the start/end position of the answer in the encoded input. 
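To make these three ideas concrete before writing the full preprocessing function, here is a minimal sketch that reuses the single-example encoding and tokenizer from above; the printed values are only illustrative, not actual outputs:

# Inspect the encoding created earlier with
# tokenizer(example["question"], example["words"], example["boxes"]).
# token_type_ids: 0 for question tokens, 1 for document-word tokens.
# word_ids(): maps each token to the index of the word it came from
#             (None for special tokens such as [CLS] and [SEP]).
# cls_token_id: lets us locate the [CLS] token, used as a fallback position.
cls_index = encoding["input_ids"].index(tokenizer.cls_token_id)
print(cls_index)                        # typically 0
print(encoding["token_type_ids"][:12])  # e.g. [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]
print(encoding.word_ids(0)[:12])        # e.g. [None, 0, 1, 2, 3, 4, 5, 6, None, 0, 0, 1]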
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_78.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..a531de555bbee5274812e11d7def43275ceaf720 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_78.txt @@ -0,0 +1,76 @@ +With that in mind, let's create a function to encode a batch of examples in the dataset: + +def encode_dataset(examples, max_length=512): + questions = examples["question"] + words = examples["words"] + boxes = examples["boxes"] + answers = examples["answer"] + + # encode the batch of examples and initialize the start_positions and end_positions + encoding = tokenizer(questions, words, boxes, max_length=max_length, padding="max_length", truncation=True) + start_positions = [] + end_positions = [] + # loop through the examples in the batch + for i in range(len(questions)): + cls_index = encoding["input_ids"][i].index(tokenizer.cls_token_id) + # find the position of the answer in example's words + words_example = [word.lower() for word in words[i]] + answer = answers[i] + match, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split()) + if match: + # if match is found, use token_type_ids to find where words start in the encoding + token_type_ids = encoding["token_type_ids"][i] + token_start_index = 0 + while token_type_ids[token_start_index] != 1: + token_start_index += 1 + token_end_index = len(encoding["input_ids"][i]) - 1 + while token_type_ids[token_end_index] != 1: + token_end_index -= 1 + word_ids = encoding.word_ids(i)[token_start_index : token_end_index + 1] + start_position = cls_index + end_position = cls_index + # loop over word_ids and increase token_start_index until it matches the answer position in words + # once it matches, save the token_start_index as the start_position of the answer in the encoding + for id in word_ids: + if id == word_idx_start: + start_position = token_start_index + else: + token_start_index += 1 + # similarly loop over word_ids starting from the end to find the end_position of the answer + for id in word_ids[::-1]: + if id == word_idx_end: + end_position = token_end_index + else: + token_end_index -= 1 + start_positions.append(start_position) + end_positions.append(end_position) + else: + start_positions.append(cls_index) + end_positions.append(cls_index) + encoding["image"] = examples["image"] + encoding["start_positions"] = start_positions + encoding["end_positions"] = end_positions + return encoding + +Now that we have this preprocessing function, we can encode the entire dataset: + +encoded_train_dataset = dataset_with_ocr["train"].map( + encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["train"].column_names + ) +encoded_test_dataset = dataset_with_ocr["test"].map( + encode_dataset, batched=True, batch_size=2, remove_columns=dataset_with_ocr["test"].column_names + ) + +Let's check what the features of the encoded dataset look like: + +encoded_train_dataset.features +{'image': Sequence(feature=Sequence(feature=Sequence(feature=Value(dtype='uint8', id=None), length=-1, id=None), length=-1, id=None), length=-1, id=None), + 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), + 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), + 'bbox': 
Sequence(feature=Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), length=-1, id=None), + 'start_positions': Value(dtype='int64', id=None), + 'end_positions': Value(dtype='int64', id=None)} + +Evaluation +Evaluation for document question answering requires a significant amount of postprocessing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_79.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..93465fb1f8beeb367b500f970e440a0f65a144db --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_79.txt @@ -0,0 +1,2 @@ +To avoid taking up too much +of your time, this guide skips the evaluation step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_80.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..c166c52973b0d103491706af4900f89c671df844 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_80.txt @@ -0,0 +1,2 @@ +The [Trainer] still calculates the evaluation loss during training so +you're not completely in the dark about your model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_81.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1a7e49c22fa294c061d9c6d4c81b671d7338e91 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_81.txt @@ -0,0 +1 @@ +Extractive question answering is typically evaluated using F1/exact match. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_82.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..670454f5c2911ff3b8584f625c9c6ba083d35bc3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_82.txt @@ -0,0 +1,2 @@ +If you'd like to implement it yourself, check out the Question Answering chapter +of the Hugging Face course for inspiration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_83.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..b966c0f4d1e60fdb73a68f9e49c57762b0fdc619 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_83.txt @@ -0,0 +1,2 @@ +Train +Congratulations! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_84.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..b34e9dbdeb2ee7c0d3c8acb8689b5df6857657a5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_84.txt @@ -0,0 +1 @@ +You've successfully navigated the toughest part of this guide and now you are ready to train your own model. 
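Should you decide to implement the skipped evaluation yourself later on, the squad metric from the 🤗 Evaluate library computes exact match and F1 for you. Here is a minimal, hypothetical sketch; the example id and answer strings are made up purely for illustration:

import evaluate

# predictions: decoded answer strings for each example
# references: ground-truth answers in the SQuAD format
squad_metric = evaluate.load("squad")
predictions = [{"id": "0", "prediction_text": "t.f. riehl"}]
references = [{"id": "0", "answers": {"text": ["T.F. Riehl"], "answer_start": [0]}}]
print(squad_metric.compute(predictions=predictions, references=references))
# returns a dict with 'exact_match' and 'f1' scores

With or without evaluation, you can now move on to training.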
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_85.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..4461b33c86e2b3cb26555088be1ee2f32081b2de --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_85.txt @@ -0,0 +1,2 @@ +Training involves the following steps: +* Load the model with [AutoModelForDocumentQuestionAnswering] using the same checkpoint as in the preprocessing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_86.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..612a454da762d61c73a2e212879557bbd2928cba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_86.txt @@ -0,0 +1 @@ +* Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_87.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4d45b35fb106440fa55202878c97facc7736824 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_87.txt @@ -0,0 +1,2 @@ +* Define a function to batch examples together, here the [DefaultDataCollator] will do just fine +* Pass the training arguments to [Trainer] along with the model, dataset, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_88.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..a77f7d6e80d4a57cbd26ad5f530befbee7f1f3eb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_88.txt @@ -0,0 +1 @@ +* Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_89.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..d60d662216418fdd0a1f1a0f61af405b04c9cf6f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_89.txt @@ -0,0 +1,4 @@ +from transformers import AutoModelForDocumentQuestionAnswering +model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint) + +In the [TrainingArguments] use output_dir to specify where to save your model, and configure hyperparameters as you see fit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_90.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..2343692646ab4bacab53d716bdb58e79db9937b9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_90.txt @@ -0,0 +1 @@ +If you wish to share your model with the community, set push_to_hub to True (you must be signed in to Hugging Face to upload your model). 
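One way to sign in, assuming you are working in a notebook (from a terminal you can run huggingface-cli login instead):

from huggingface_hub import notebook_login

# Prompts for a Hugging Face access token and stores it for later pushes
notebook_login()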
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_91.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d53db1f2f9cf21eca6497aced5cb02fd849393b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_91.txt @@ -0,0 +1 @@ +In this case the output_dir will also be the name of the repo where your model checkpoint will be pushed. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_92.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..3382c05b6e7c73aee9bab9062e43452f2048113f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_92.txt @@ -0,0 +1,17 @@ +from transformers import TrainingArguments +REPLACE THIS WITH YOUR REPO ID +repo_id = "MariaK/layoutlmv2-base-uncased_finetuned_docvqa" +training_args = TrainingArguments( + output_dir=repo_id, + per_device_train_batch_size=4, + num_train_epochs=20, + save_steps=200, + logging_steps=50, + evaluation_strategy="steps", + learning_rate=5e-5, + save_total_limit=2, + remove_unused_columns=False, + push_to_hub=True, + ) + +Define a simple data collator to batch examples together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_93.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b350a20aefbb2029ee851de3581569d5751e9bf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_93.txt @@ -0,0 +1,23 @@ +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator() + +Finally, bring everything together, and call [~Trainer.train]: + +from transformers import Trainer +trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=encoded_train_dataset, + eval_dataset=encoded_test_dataset, + tokenizer=processor, + ) +trainer.train() + +To add the final model to 🤗 Hub, create a model card and call push_to_hub: + +trainer.create_model_card() +trainer.push_to_hub() + +Inference +Now that you have finetuned a LayoutLMv2 model, and uploaded it to the 🤗 Hub, you can use it for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_94.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ddc619b653dad7d0ba9bc12d416e144fc6768fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_94.txt @@ -0,0 +1,2 @@ +The simplest +way to try out your finetuned model for inference is to use it in a [Pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_95.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9ac57d40d8b9ae1f8c6d038e0e80fb4511b56ff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_95.txt @@ -0,0 +1,8 @@ +Let's take an example: + +example = dataset["test"][2] +question = example["query"]["en"] +image = example["image"] +print(question) +print(example["answers"]) +'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_96.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..766263477795548357b387fcd89a7694d0c2db4d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_96.txt @@ -0,0 +1,4 @@ +['TRRF Vice President', 'lee a. waller'] + +Next, instantiate a pipeline for +document question answering with your model, and pass the image + question combination to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_97.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d65a87f826e1301171bd7c41901d6bbae1c5ba7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_97.txt @@ -0,0 +1,10 @@ +from transformers import pipeline +qa_pipeline = pipeline("document-question-answering", model="MariaK/layoutlmv2-base-uncased_finetuned_docvqa") +qa_pipeline(image, question) +[{'score': 0.9949808120727539, + 'answer': 'Lee A. Waller', + 'start': 55, + 'end': 57}] + +You can also manually replicate the results of the pipeline if you'd like: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_98.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c51f9219a2f8a10c35c23572649f04bd47b3e1b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_98.txt @@ -0,0 +1 @@ +Take an image and a question, prepare them for the model using the processor from your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_document_question_answering/chunk_99.txt b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_document_question_answering/chunk_99.txt @@ -0,0 +1 @@ +2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_27.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfbe555083a4f9f2283a28edf8a71fb47fae2399 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_27.txt @@ -0,0 +1,13 @@ +prompt = [ + "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80", + ] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +A puppy in a flower bed + +It is a good idea to include the bad_words_ids in the call to generate to avoid errors arising when increasing +the max_new_tokens: the model will want to generate a new or token when there +is no image being generated by the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_28.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..14786cb12e8d1314569ae09e4477e737e00cdf65 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_28.txt @@ -0,0 +1 @@ +You can set it on-the-fly as in this guide, or store in the GenerationConfig as described in the Text generation strategies guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_29.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9252a3b632e08a757f9b219bb13d4709a63fa5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_29.txt @@ -0,0 +1,2 @@ +Prompted image captioning +You can extend image captioning by providing a text prompt, which the model will continue given the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_30.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..42e54b11762d3524510fbdbee6c00778c1b2261b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_30.txt @@ -0,0 +1,4 @@ +Let's take +another image to illustrate: + +Photo by Denys Nevozhai. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_31.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..82492b919224fd99c89683b3a39153c8418cd22f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_31.txt @@ -0,0 +1 @@ +Textual and image prompts can be passed to the model's processor as a single list to create appropriate inputs. 
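Before the prompted-captioning example that follows, a quick note on the bad_words_ids used in these snippets: the strings passed to the tokenizer are assumed here to be IDEFICS's image placeholder tokens, "<image>" and "<fake_token_around_image>", and you can store them once in a GenerationConfig instead of passing them to every generate() call. A minimal sketch, reusing the processor, model, and inputs from above:

from transformers import GenerationConfig

# Assumption: "<image>" and "<fake_token_around_image>" are the image
# placeholder tokens to exclude from generation.
bad_words_ids = processor.tokenizer(
    ["<image>", "<fake_token_around_image>"], add_special_tokens=False
).input_ids
generation_config = GenerationConfig(max_new_tokens=10, bad_words_ids=bad_words_ids)
generated_ids = model.generate(**inputs, generation_config=generation_config)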
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_32.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..44048c1cb93633aaa8af93ee675d458ea791c372 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_32.txt @@ -0,0 +1,10 @@ +prompt = [ + "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", + "This is an image of ", + ] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +This is an image of the Eiffel Tower in Paris, France. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_33.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..28e84dd641de50a1fc6083d922c36c5b1a917adf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_33.txt @@ -0,0 +1,3 @@ +Few-shot prompting +While IDEFICS demonstrates great zero-shot results, your task may require a certain format of the caption, or come with +other restrictions or requirements that increase task's complexity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_34.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..da1caa2b27e4091a83e23d22be9f56351bd8748b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_34.txt @@ -0,0 +1 @@ +Few-shot prompting can be used to enable in-context learning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_35.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..d90dc386317d87fbb1d58e15a2728a36277332e3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_35.txt @@ -0,0 +1 @@ +By providing examples in the prompt, you can steer the model to generate results that mimic the format of given examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_36.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..75c15abc68d5ef3df2e06144bbb9dbf88c7ed0ef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_36.txt @@ -0,0 +1,2 @@ +Let's use the previous image of the Eiffel Tower as an example for the model and build a prompt that demonstrates to the model +that in addition to learning what the object in an image is, we would also like to get some interesting information about it. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_37.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..b933a89b8f1c65424ee8bbfe02642df53f894137 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_37.txt @@ -0,0 +1,3 @@ +Then, let's see, if we can get the same response format for an image of the Statue of Liberty: + +Photo by Juan Mayobre. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_38.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..26d828f8d78ae8aafd3833c6571f45503d6d669b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_38.txt @@ -0,0 +1,3 @@ +prompt = ["User:", + "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", + "Describe this image.\nAssistant: An image of the Eiffel Tower at night. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_39.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..55b14ff89bbbcaf7ec9ea89a2d87cf10727ec85a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_39.txt @@ -0,0 +1,11 @@ +Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n", + "User:", + "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80", + "Describe this image.\nAssistant:" + ] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +User: Describe this image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_40.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..c71c460fcef02ef9c508accd6e1f851e5dd70d3b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_40.txt @@ -0,0 +1 @@ +Assistant: An image of the Eiffel Tower at night. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_41.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..975ce46ecacabfbfeda136e602f8103b0136c967 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_41.txt @@ -0,0 +1 @@ +Fun fact: the Eiffel Tower is the same height as an 81-storey building. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_42.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..b10bc03a873a7006a2fbc80cc8e73be1cbb5e02b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_42.txt @@ -0,0 +1 @@ +User: Describe this image. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_43.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e58cbc42bdca3f9f6f6739b0516cae4639432fb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_43.txt @@ -0,0 +1 @@ +Assistant: An image of the Statue of Liberty. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_44.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..4efcb039ab9f04108b197172d04d5dd2ce3ee413 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_44.txt @@ -0,0 +1 @@ +Fun fact: the Statue of Liberty is 151 feet tall. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_45.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..6fb5e9c90aef6b51dd50788391f12647db8e8c44 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_45.txt @@ -0,0 +1 @@ +Notice that just from a single example (i.e., 1-shot) the model has learned how to perform the task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_46.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1029e982fe08af414caeb632203c41594d030e4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_46.txt @@ -0,0 +1,2 @@ +For more complex tasks, +feel free to experiment with a larger number of examples (e.g., 3-shot, 5-shot, etc.). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_47.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..89ddf60bc15f10d57b3fae0a5fdb2b245ad5f4ea --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_47.txt @@ -0,0 +1,2 @@ +Visual question answering +Visual Question Answering (VQA) is the task of answering open-ended questions based on an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_48.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..9961463523087e7abf2788fc7191a778b29aa774 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_48.txt @@ -0,0 +1,3 @@ +Similar to image +captioning it can be used in accessibility applications, but also in education (reasoning about visual materials), customer +service (questions about products based on images), and image retrieval. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_49.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..b55a4fb72bcba7dd359deb347507c9407cee55f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_49.txt @@ -0,0 +1,3 @@ +Let's get a new image for this task: + +Photo by Jarritos Mexican Soda. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_50.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..1438f03a3630bc9e79f79be959995fa4f9bd95b1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_50.txt @@ -0,0 +1,4 @@ +You can steer the model from image captioning to visual question answering by prompting it with appropriate instructions: + +prompt = [ + "Instruction: Provide an answer to the question. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_51.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e08a710bd5c2661187052254c43beaf4cd56888 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_51.txt @@ -0,0 +1,3 @@ +Use the image to answer.\n", + "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", + "Question: Where are these people and what's the weather like? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_52.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..134a47aff3b67ce4a5d041c9321463e1a2fc70f0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_52.txt @@ -0,0 +1,8 @@ +Answer:" + ] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=20, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +Instruction: Provide an answer to the question. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_53.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d74f7759a49173ec6c503869c91e4d068b5090a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_53.txt @@ -0,0 +1 @@ +Use the image to answer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_54.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..b44051ec4a40e160b92c7b2f6849fa319a297fd4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_54.txt @@ -0,0 +1 @@ +Question: Where are these people and what's the weather like? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_55.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..eea9be95e8ae814fe5bfd0b2765a8effe77d46c8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_55.txt @@ -0,0 +1 @@ +Answer: They're in a park in New York City, and it's a beautiful day. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_56.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..801c23daa5d9b86c37c05a7fbe7bfd3129f1414f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_56.txt @@ -0,0 +1,3 @@ +Image classification +IDEFICS is capable of classifying images into different categories without being explicitly trained on data containing +labeled examples from those specific categories. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_57.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..c22b7aedfa111955b9f48f3db599f36d90f038f3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_57.txt @@ -0,0 +1,2 @@ +Given a list of categories and using its image and text understanding +capabilities, the model can infer which category the image likely belongs to. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_58.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..975bf6fc944f87cae97a2ee76a150a081ee71d6b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_58.txt @@ -0,0 +1,3 @@ +Say, we have this image of a vegetable stand: + +Photo by Peter Wendt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_59.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b1fac54e465cd7e781caed8f46175163aa83dbd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_59.txt @@ -0,0 +1,13 @@ +We can instruct the model to classify the image into one of the categories that we have: + +categories = ['animals','vegetables', 'city landscape', 'cars', 'office'] +prompt = [f"Instruction: Classify the following image into a single category from the following list: {categories}.\n", + "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", + "Category: " + ] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office']. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_60.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..844add64ce943f3c59152b7a943ea9c02876e094 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_60.txt @@ -0,0 +1,4 @@ +Category: Vegetables +``` + +In the example above we instruct the model to classify the image into a single category, however, you can also prompt the model to do rank classification. 
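A minimal, hypothetical sketch of such a rank-classification prompt, reusing the same image, categories, and bad_words_ids as in the snippet above (the instruction wording is an assumption):

categories = ['animals', 'vegetables', 'city landscape', 'cars', 'office']
prompt = [f"Instruction: Rank the categories in the following list from most to least fitting for the image: {categories}.\n",
    "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
    "Ranking: "
    ]
inputs = processor(prompt, return_tensors="pt").to("cuda")
generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0])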
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_61.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..314f27085bbdc8a61239f76a63561309d518ea3a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_61.txt @@ -0,0 +1,2 @@ +Image-guided text generation +For more creative applications, you can use image-guided text generation to generate text based on an image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_62.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..b57404ee5d182137906804326b0b5a84f8d2f788 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_62.txt @@ -0,0 +1,2 @@ +This can be +useful to create descriptions of products, ads, descriptions of a scene, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_63.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f79b45cbb5ff4a7f8810f7c3ed9f65005493df0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_63.txt @@ -0,0 +1,3 @@ +Let's prompt IDEFICS to write a story based on a simple image of a red door: + +Photo by Craig Tidball. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_64.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..c617705acad6c474d93cab41a1e661616efcf6ec --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_64.txt @@ -0,0 +1 @@ +prompt = ["Instruction: Use the image to write a story. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_65.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..2860375e81c36ba3f67d3777875aed72263b4141 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_65.txt @@ -0,0 +1,9 @@ +\n", + "https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80", + "Story: \n"] +inputs = processor(prompt, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["", ""], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, num_beams=2, max_new_tokens=200, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +print(generated_text[0]) +Instruction: Use the image to write a story. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_66.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f289c93c05e692d2ef6c533de3b39ac843c6234 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_66.txt @@ -0,0 +1,2 @@ +Story: +Once upon a time, there was a little girl who lived in a house with a red door. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_67.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..4db2c8065c4681ed61e176023d5f695b1f894e8b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_67.txt @@ -0,0 +1 @@ +She loved her red door. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_68.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..a334a6c9a0e1eee771ec8b3b88fbe6db3675859f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_68.txt @@ -0,0 +1 @@ +It was the prettiest door in the whole world. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_69.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..d91398625958987fa60da54f536ca2d943609844 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_69.txt @@ -0,0 +1 @@ +One day, the little girl was playing in her yard when she noticed a man standing on her doorstep. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_70.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..220443fb0c88118bde84e127ac3473a0318f2bd1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_70.txt @@ -0,0 +1 @@ +He was wearing a long black coat and a top hat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_71.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..79fb1fb499306ac29979860e2fa2b1ea2dd08584 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_71.txt @@ -0,0 +1 @@ +The little girl ran inside and told her mother about the man. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_72.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..889ba957a2f102c1ddb7e1ce6ce2f08b4c4b1eb6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_72.txt @@ -0,0 +1 @@ +Her mother said, “Don’t worry, honey. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_73.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..97f37a5f15c334fe8eaa91e6990a773b0dcff356 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_73.txt @@ -0,0 +1,2 @@ +He’s just a friendly ghost.†+The little girl wasn’t sure if she believed her mother, but she went outside anyway. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_74.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..74ce9d01dfe8e359df4fab4133975d2003624dfb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_74.txt @@ -0,0 +1 @@ +When she got to the door, the man was gone. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_75.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1bcea5e3d4c1d01f130ccdd3fb860d2243a801a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_75.txt @@ -0,0 +1 @@ +The next day, the little girl was playing in her yard again when she noticed the man standing on her doorstep. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_76.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..220443fb0c88118bde84e127ac3473a0318f2bd1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_76.txt @@ -0,0 +1 @@ +He was wearing a long black coat and a top hat. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_77.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..15b1780b5e5d7104f426bb4039156f2016c8b9e9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_77.txt @@ -0,0 +1,3 @@ +The little girl ran + +Looks like IDEFICS noticed the pumpkin on the doorstep and went with a spooky Halloween story about a ghost. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_78.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fcd4d0bd0709ca3f7c35d9b2e66ec0eabcce47e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_78.txt @@ -0,0 +1 @@ +For longer outputs like this, you will greatly benefit from tweaking the text generation strategy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_79.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5377e565d697b6642f9ce6b43fb26b359ffcf33 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_79.txt @@ -0,0 +1,2 @@ +This can help +you significantly improve the quality of the generated output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_80.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a28909630c3051c919d7a894155e4da1a8baa7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_80.txt @@ -0,0 +1,2 @@ +Check out Text generation strategies +to learn more. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_81.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6c2e63297aa4ae6a7898855dd50e78c1fb67813 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_81.txt @@ -0,0 +1,2 @@ +Running inference in batch mode +All of the earlier sections illustrated IDEFICS for a single example. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_82.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd7ab04fe79a8268045b11f2b2dc63ed7cc67c41 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_82.txt @@ -0,0 +1,22 @@ +In a very similar fashion, you can run inference +for a batch of examples by passing a list of prompts: + +prompts = [ + [ "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80", + "This is an image of ", + ], + [ "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", + "This is an image of ", + ], + [ "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80", + "This is an image of ", + ], + ] +inputs = processor(prompts, return_tensors="pt").to("cuda") +bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +for i,t in enumerate(generated_text): + print(f"{i}:\n{t}\n") +0: +This is an image of the Eiffel Tower in Paris, France. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_83.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..297a65fe4b149234812d931475e9edb622ebe750 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_83.txt @@ -0,0 +1,2 @@ +1: +This is an image of a couple on a picnic blanket. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_84.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..008bf603076d6338f575b40ae6511d3438fe3c66 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_84.txt @@ -0,0 +1,2 @@ +2: +This is an image of a vegetable stand. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_85.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5bf58864b8b21eee2ac722c65c0da6557eafa3a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_85.txt @@ -0,0 +1,3 @@ +IDEFICS instruct for conversational use +For conversational use cases, you can find fine-tuned instructed versions of the model on the 🤗 Hub: +HuggingFaceM4/idefics-80b-instruct and HuggingFaceM4/idefics-9b-instruct. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_86.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9fd2711a5b6c0a9388c2c0d64cbee0bafffba51 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_86.txt @@ -0,0 +1,2 @@ +These checkpoints are the result of fine-tuning the respective base models on a mixture of supervised and instruction +fine-tuning datasets, which boosts the downstream performance while making the models more usable in conversational settings.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_87.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..e662ba2d64214ffc6d95630aa4d7af28b14d588d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_87.txt @@ -0,0 +1,11 @@ +The use and prompting for the conversational use is very similar to using the base models: + +import torch +from transformers import IdeficsForVisionText2Text, AutoProcessor +device = "cuda" if torch.cuda.is_available() else "cpu" +checkpoint = "HuggingFaceM4/idefics-9b-instruct" +model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device) +processor = AutoProcessor.from_pretrained(checkpoint) +prompts = [ + [ + "User: What is in this image? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_88.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..c993520efb9168a7f8034419cd324c4e812b52f3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_88.txt @@ -0,0 +1,5 @@ +", + "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG", + "", + + "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_89.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..702ffa3d2a02465e430a8f8bcf4a60faeb5d480e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_89.txt @@ -0,0 +1 @@ +Idefix is running on the ground. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_90.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..96c41f34d2788085e9a8495db8e658a7f76b6395 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_90.txt @@ -0,0 +1,4 @@ +", + "\nUser:", + "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052", + "And who is that? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_idefics/chunk_91.txt b/chunked/content_aware_chunking/tasks_idefics/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..26b84c4a7adcc5b9576dbb56c07aff2ab1e5eb21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_idefics/chunk_91.txt @@ -0,0 +1,16 @@ +", + "\nAssistant:", + ], + ] + +# --batched mode +inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device) +# --single sample mode +# inputs = processor(prompts[0], return_tensors="pt").to(device) +# Generation args +exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids +bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids +generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) +for i, t in enumerate(generated_text): + print(f"{i}:\n{t}\n") \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_13.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e33de66ca40969e177d183440717973b24539a7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_13.txt @@ -0,0 +1 @@ +To do so, load the processor class associated with the model you are about to fine-tune. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_14.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..18187e3d96d09efee7ed1f6b6cf96a197db085ab --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_14.txt @@ -0,0 +1,6 @@ +thon +from transformers import AutoProcessor +checkpoint = "microsoft/git-base" +processor = AutoProcessor.from_pretrained(checkpoint) + +The processor will internally pre-process the image (which includes resizing and pixel scaling) and tokenize the caption. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_15.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c57984926e0a0361e7f2ce78ee3902de606b604c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_15.txt @@ -0,0 +1,11 @@ +thon +def transforms(example_batch): + images = [x for x in example_batch["image"]] + captions = [x for x in example_batch["text"]] + inputs = processor(images=images, text=captions, padding="max_length") + inputs.update({"labels": inputs["input_ids"]}) + return inputs +train_ds.set_transform(transforms) +test_ds.set_transform(transforms) + +With the dataset ready, you can now set up the model for fine-tuning. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_16.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dad67f0173746d7df400ad4359d581e2134ddc5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_16.txt @@ -0,0 +1,2 @@ +Load a base model +Load the "microsoft/git-base" into an AutoModelForCausalLM object.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_17.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d6fd0919ca6a522eeb83477a71eac78d26e2f3b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_17.txt @@ -0,0 +1,6 @@ +thon +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained(checkpoint) + +Evaluate +Image captioning models are typically evaluated with the Rouge Score or Word Error Rate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_18.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc0215f0ff3bdb121f7199feebb6b70c64cf1b19 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_18.txt @@ -0,0 +1 @@ +For this guide, you will use the Word Error Rate (WER). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_19.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c854f0bbed91d9e125980eca265653705a8398e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_19.txt @@ -0,0 +1 @@ +We use the 🤗 Evaluate library to do so. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_20.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d62ab2ea4ae1a6754110b178a2b2608cb571220 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_20.txt @@ -0,0 +1 @@ +For potential limitations and other gotchas of the WER, refer to this guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_21.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe8a6d300e306590342a9dd2758656369e4d63fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_21.txt @@ -0,0 +1,13 @@ +thon +from evaluate import load +import torch +wer = load("wer") +def compute_metrics(eval_pred): + logits, labels = eval_pred + predicted = logits.argmax(-1) + decoded_labels = processor.batch_decode(labels, skip_special_tokens=True) + decoded_predictions = processor.batch_decode(predicted, skip_special_tokens=True) + wer_score = wer.compute(predictions=decoded_predictions, references=decoded_labels) + return {"wer_score": wer_score} + +Train! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_22.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..68d9be666451c0ec0c62ad1a8a806760bb2d87a4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_22.txt @@ -0,0 +1 @@ +Now, you are ready to start fine-tuning the model. 
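For reference, the wer_score computed above follows the standard definition of the word error rate (a textbook formula, not something specific to this guide):

$$\mathrm{WER} = \frac{S + D + I}{N}$$

where $S$, $D$ and $I$ are the numbers of word substitutions, deletions and insertions needed to turn the prediction into the reference caption, and $N$ is the number of words in the reference. Lower is better, and values above 1 are possible when the prediction contains many extra words.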
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_23.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..533200445b66c9456d9c53dd9350dd7aade2e127 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_23.txt @@ -0,0 +1 @@ +You will use the 🤗 [Trainer] for this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_24.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7db49cddb9e321ca97daf320f025086bf229e369 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_24.txt @@ -0,0 +1 @@ +First, define the training arguments using [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_25.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..896b8721115180c46aafacd329fc0b289e1915ae --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_25.txt @@ -0,0 +1,24 @@ +thon +from transformers import TrainingArguments, Trainer +model_name = checkpoint.split("/")[1] +training_args = TrainingArguments( + output_dir=f"{model_name}-pokemon", + learning_rate=5e-5, + num_train_epochs=50, + fp16=True, + per_device_train_batch_size=32, + per_device_eval_batch_size=32, + gradient_accumulation_steps=2, + save_total_limit=3, + evaluation_strategy="steps", + eval_steps=50, + save_strategy="steps", + save_steps=50, + logging_steps=50, + remove_unused_columns=False, + push_to_hub=True, + label_names=["labels"], + load_best_model_at_end=True, +) + +Then pass them along with the datasets and the model to 🤗 Trainer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_26.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1224eed8b09d770a12c4dece0bcbb54062186443 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_26.txt @@ -0,0 +1,9 @@ +python +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds, + compute_metrics=compute_metrics, +) +To start training, simply call [~Trainer.train] on the [Trainer] object. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_27.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed949b344163200f1acf1f1ccb7fd272dff1277f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_27.txt @@ -0,0 +1,3 @@ +python +trainer.train() +You should see the training loss drop smoothly as training progresses. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_28.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..aeeec308d4e0cfc793f9af88014c56259091c9f5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_28.txt @@ -0,0 +1,5 @@ +Once training is completed, share your model to the Hub with the [~Trainer.push_to_hub] method so everyone can use your model: +python +trainer.push_to_hub() +Inference +Take a sample image from test_ds to test the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_29.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad6b6ef46055cd1fee942de3ef7b17675dd88307 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_29.txt @@ -0,0 +1,8 @@ +thon +from PIL import Image +import requests +url = "https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/pokemon.png" +image = Image.open(requests.get(url, stream=True).raw) +image + +Prepare image for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_30.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..365a71dc914d5215594f7b1cb511b98ac741d2a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_30.txt @@ -0,0 +1,6 @@ +thon +device = "cuda" if torch.cuda.is_available() else "cpu" +inputs = processor(images=image, return_tensors="pt").to(device) +pixel_values = inputs.pixel_values + +Call [generate] and decode the predictions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_captioning/chunk_31.txt b/chunked/content_aware_chunking/tasks_image_captioning/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f897ef968d3b3986cc68cc0e441251976d521dc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_captioning/chunk_31.txt @@ -0,0 +1,7 @@ +python +generated_ids = model.generate(pixel_values=pixel_values, max_length=50) +generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0] +print(generated_caption) + +a drawing of a pink and blue pokemon +Looks like the fine-tuned model generated a pretty good caption! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_15.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..21273682e1dd557e9bacb308ad01c5c05b90ced1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_15.txt @@ -0,0 +1,11 @@ +images = [ + val_data_augmentation(convert_to_tf_tensor(image.convert("RGB"))) for image in example_batch["image"] + ] + example_batch["pixel_values"] = [tf.transpose(tf.squeeze(image)) for image in images] + return example_batch + +Use 🤗 Datasets [~datasets.Dataset.set_transform] to apply the transformations on the fly: +py +food["train"].set_transform(preprocess_train) +food["test"].set_transform(preprocess_val) +As a final preprocessing step, create a batch of examples using DefaultDataCollator. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_16.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..753cb23426bfe03b51447b4e0541edfa25eac9ab --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_16.txt @@ -0,0 +1,2 @@ +Unlike other data collators in 🤗 Transformers, the +DefaultDataCollator does not apply additional preprocessing, such as padding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_17.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea27bdaa6b8a1a2344f6ac8effc17540ee623480 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_17.txt @@ -0,0 +1,5 @@ +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator(return_tensors="tf") + +Evaluate +Including a metric during training is often helpful for evaluating your model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_18.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..5349d2c902871fffcd87a0efece690f0751fd081 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_18.txt @@ -0,0 +1,2 @@ +You can quickly load an +evaluation method with the 🤗 Evaluate library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_19.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c397de66037bb449284da27520e08708b05dd88 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_19.txt @@ -0,0 +1,15 @@ +For this task, load +the accuracy metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +accuracy = evaluate.load("accuracy") + +Then create a function that passes your predictions and labels to [~evaluate.EvaluationModule.compute] to calculate the accuracy: + +import numpy as np +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return accuracy.compute(predictions=predictions, references=labels) + +Your compute_metrics function is ready to go now, and you'll return to it when you set up your training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_20.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f493faa6fd8490ca5e6198c99a80a5e6e236cf1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_20.txt @@ -0,0 +1,3 @@ +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_21.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_21.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_22.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..be136cecfafef4ef4cd90f10e7c2299de5795519 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_22.txt @@ -0,0 +1 @@ +Load ViT with [AutoModelForImageClassification]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_23.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..235782d452dda322d7ecf038aeca3f5648ac0828 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_23.txt @@ -0,0 +1,13 @@ +Specify the number of labels along with the number of expected labels, and the label mappings: + +from transformers import AutoModelForImageClassification, TrainingArguments, Trainer +model = AutoModelForImageClassification.from_pretrained( + checkpoint, + num_labels=len(labels), + id2label=id2label, + label2id=label2id, + ) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_24.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..9927197b4b60b9768f2a1fd4dc1b4cfcdc94bf1d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_24.txt @@ -0,0 +1 @@ +It is important you don't remove unused columns because that'll drop the image column. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_25.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e341b4d05beb0aeabb3c2b93ba2d643d64906272 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_25.txt @@ -0,0 +1 @@ +Without the image column, you can't create pixel_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_26.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..841530cb0c0b51477d1c178cac331ec56c22c80a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_26.txt @@ -0,0 +1 @@ +Set remove_unused_columns=False to prevent this behavior! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c1ad0b6b2c9b9d6e78a522b09d9d023e6c3842d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_27.txt @@ -0,0 +1 @@ +The only other required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_28.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_29.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c977d29a14cc4699a4b06a477bb52c2490bdc8e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_29.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the accuracy and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_30.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_30.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_31.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_31.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_32.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc0c68f33804e88be6a78473d9e3377efc3d83c3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_32.txt @@ -0,0 +1,32 @@ +training_args = TrainingArguments( + output_dir="my_awesome_food_model", + remove_unused_columns=False, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=16, + gradient_accumulation_steps=4, + per_device_eval_batch_size=16, + num_train_epochs=3, + warmup_ratio=0.1, + logging_steps=10, + load_best_model_at_end=True, + metric_for_best_model="accuracy", + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=food["train"], + eval_dataset=food["test"], + tokenizer=image_processor, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you are unfamiliar with fine-tuning a model with Keras, check out the basic tutorial first! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_33.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d13aec409721c5e7024766b2ac948287e12ae57 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_33.txt @@ -0,0 +1,2 @@ +To fine-tune a model in TensorFlow, follow these steps: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_34.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..6641a1f2b655a4b38d7e344f5abc14eb31d665e2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_34.txt @@ -0,0 +1 @@ +Define the training hyperparameters, and set up an optimizer and a learning rate schedule. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_35.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_35.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_36.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..15cea3797701e4511461a421811d551ce4a8bf15 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_36.txt @@ -0,0 +1 @@ +Instantiate a pre-trained model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_37.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_37.txt @@ -0,0 +1 @@ +3. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_38.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..87bb1fd7a7425780f38c7b9dcf7df5d17d561eb4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_38.txt @@ -0,0 +1 @@ +Convert a 🤗 Dataset to a tf.data.Dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_39.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_39.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_40.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff34899a31366c166b93096a5c69a295c12ef1bc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_40.txt @@ -0,0 +1 @@ +Compile your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_41.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_41.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_42.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb980c9ae1c3471fe5258be2e71dfde2f2b1563f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_42.txt @@ -0,0 +1 @@ +Add callbacks and use the fit() method to run the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_43.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..66aa82048927681af984d35e45d694eaf46a34f9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_43.txt @@ -0,0 +1 @@ +6. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_44.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..785726c71dd5c8c5280dc114f65bf745a2f8e5b4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_44.txt @@ -0,0 +1 @@ +Upload your model to 🤗 Hub to share with the community. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_45.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9a2ee412b88d9d3981a0d91ad273fd3a2c1f795 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_45.txt @@ -0,0 +1,42 @@ +Start by defining the hyperparameters, optimizer and learning rate schedule: + +from transformers import create_optimizer +batch_size = 16 +num_epochs = 5 +num_train_steps = len(food["train"]) * num_epochs +learning_rate = 3e-5 +weight_decay_rate = 0.01 +optimizer, lr_schedule = create_optimizer( + init_lr=learning_rate, + num_train_steps=num_train_steps, + weight_decay_rate=weight_decay_rate, + num_warmup_steps=0, + ) + +Then, load ViT with [TFAutoModelForImageClassification] along with the label mappings: + +from transformers import TFAutoModelForImageClassification +model = TFAutoModelForImageClassification.from_pretrained( + checkpoint, + id2label=id2label, + label2id=label2id, + ) + +Convert your datasets to the tf.data.Dataset format using the [~datasets.Dataset.to_tf_dataset] and your data_collator: + +converting our train dataset to tf.data.Dataset +tf_train_dataset = food["train"].to_tf_dataset( + columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator + ) +converting our test dataset to tf.data.Dataset +tf_eval_dataset = food["test"].to_tf_dataset( + columns="pixel_values", label_cols="label", shuffle=True, batch_size=batch_size, collate_fn=data_collator + ) + +Configure the model for training with compile(): + +from tensorflow.keras.losses import SparseCategoricalCrossentropy +loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) +model.compile(optimizer=optimizer, loss=loss) + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use Keras callbacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_46.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f3642f2822fec518ed9609e6aeeb257c0243070 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_46.txt @@ -0,0 +1,13 @@ +Pass your compute_metrics function to KerasMetricCallback, +and use the PushToHubCallback to upload the model: + +from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) +push_to_hub_callback = PushToHubCallback( + output_dir="food_classifier", + tokenizer=image_processor, + save_strategy="no", + ) +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you are ready to train your model! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_47.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ce26b99c5a3c3d46afe1fd094789e2ed12728e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_47.txt @@ -0,0 +1,16 @@ +Call fit() with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +model.fit(tf_train_dataset, validation_data=tf_eval_dataset, epochs=num_epochs, callbacks=callbacks) +Epoch 1/5 +250/250 [==============================] - 313s 1s/step - loss: 2.5623 - val_loss: 1.4161 - accuracy: 0.9290 +Epoch 2/5 +250/250 [==============================] - 265s 1s/step - loss: 0.9181 - val_loss: 0.6808 - accuracy: 0.9690 +Epoch 3/5 +250/250 [==============================] - 252s 1s/step - loss: 0.3910 - val_loss: 0.4303 - accuracy: 0.9820 +Epoch 4/5 +250/250 [==============================] - 251s 1s/step - loss: 0.2028 - val_loss: 0.3191 - accuracy: 0.9900 +Epoch 5/5 +250/250 [==============================] - 238s 949ms/step - loss: 0.1232 - val_loss: 0.3259 - accuracy: 0.9890 + +Congratulations! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_48.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..d47f333d728e4db8acd5b419bce2bc1b47ee091a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_48.txt @@ -0,0 +1 @@ +You have fine-tuned your model and shared it on the 🤗 Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_49.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..393d90e98de078560c7b3125197696b7b429d858 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_49.txt @@ -0,0 +1 @@ +You can now use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_50.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffb3485327fcec6474c9722a05a48dc176ac16a7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_50.txt @@ -0,0 +1 @@ +For a more in-depth example of how to finetune a model for image classification, take a look at the corresponding PyTorch notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_51.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..16c8b623c1c9d3914cd6a58f69897f2af5ad73db --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_51.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've fine-tuned a model, you can use it for inference! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_52.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab273696f7ecc02f551f62bd55fe0e07d95ef57 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_52.txt @@ -0,0 +1,6 @@ +Load an image you'd like to run inference on: + +ds = load_dataset("food101", split="validation[:10]") +image = ds["image"][0] + +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_classification/chunk_53.txt b/chunked/content_aware_chunking/tasks_image_classification/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e28e5d0d91716c2d8ffd47f61a52db8d2579f0a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_classification/chunk_53.txt @@ -0,0 +1,50 @@ +Instantiate a pipeline for image classification with your model, and pass your image to it: + +from transformers import pipeline +classifier = pipeline("image-classification", model="my_awesome_food_model") +classifier(image) +[{'score': 0.31856709718704224, 'label': 'beignets'}, + {'score': 0.015232225880026817, 'label': 'bruschetta'}, + {'score': 0.01519392803311348, 'label': 'chicken_wings'}, + {'score': 0.013022331520915031, 'label': 'pork_chop'}, + {'score': 0.012728818692266941, 'label': 'prime_rib'}] + +You can also manually replicate the results of the pipeline if you'd like: + +Load an image processor to preprocess the image and return the input as PyTorch tensors: + +from transformers import AutoImageProcessor +import torch +image_processor = AutoImageProcessor.from_pretrained("my_awesome_food_model") +inputs = image_processor(image, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +from transformers import AutoModelForImageClassification +model = AutoModelForImageClassification.from_pretrained("my_awesome_food_model") +with torch.no_grad(): + logits = model(**inputs).logits + +Get the predicted label with the highest probability, and use the model's id2label mapping to convert it to a label: + +predicted_label = logits.argmax(-1).item() +model.config.id2label[predicted_label] +'beignets' + +Load an image processor to preprocess the image and return the input as TensorFlow tensors: + +from transformers import AutoImageProcessor +image_processor = AutoImageProcessor.from_pretrained("MariaK/food_classifier") +inputs = image_processor(image, return_tensors="tf") + +Pass your inputs to the model and return the logits: + +from transformers import TFAutoModelForImageClassification +model = TFAutoModelForImageClassification.from_pretrained("MariaK/food_classifier") +logits = model(**inputs).logits + +Get the predicted label with the highest probability, and use the model's id2label mapping to convert it to a label: + +predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) +model.config.id2label[predicted_class_id] +'beignets' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_10.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a798932c16cfba5fa3ac20eab1bd57481363e94 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_10.txt @@ -0,0 +1 @@ +We will get an upscaled version of the 
cat image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_11.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..00d3c31147e95b2339c2c5bd1e9e8dbf19b48016 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_11.txt @@ -0,0 +1,7 @@ +python +upscaled = pipe(image) +print(upscaled.size) +```bash +(1072, 880) + +If you wish to do inference yourself with no pipeline, you can use the Swin2SRForImageSuperResolution and Swin2SRImageProcessor classes of transformers. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_12.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..12c696d1690e4d862278f60bda5988235aa2c4c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_12.txt @@ -0,0 +1 @@ +We will use the same model checkpoint for this. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_13.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..158e8f02d31aa922e36ad6f8ba634a721224383d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_13.txt @@ -0,0 +1 @@ +Let's initialize the model and the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_14.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2da087030cd4c846a7100182916845eb9ea70cc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_14.txt @@ -0,0 +1,6 @@ +thon +from transformers import Swin2SRForImageSuperResolution, Swin2SRImageProcessor +model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-lightweight-x2-64").to(device) +processor = Swin2SRImageProcessor("caidas/swin2SR-lightweight-x2-64") + +pipeline abstracts away the preprocessing and postprocessing steps that we have to do ourselves, so let's preprocess the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_15.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfecbf71458086b3abfafcc37294988e2baf1f79 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_15.txt @@ -0,0 +1 @@ +We will pass the image to the processor and then move the pixel values to GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_16.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..899c6104733418d914b1cb38d0fb72151defd84a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_16.txt @@ -0,0 +1,6 @@ +thon +pixel_values = processor(image, return_tensors="pt").pixel_values +print(pixel_values.shape) +pixel_values = pixel_values.to(device) + +We can now infer the image by passing pixel values to the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_17.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc8e9f378a978abb2e7e24cdf38e19fae4891a67 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_17.txt @@ -0,0 +1,15 @@ +thon +import torch +with torch.no_grad(): + outputs = model(pixel_values) +`` +Output is an object of typeImageSuperResolutionOutput` that looks like below 👇 +(loss=None, reconstruction=tensor([[[[0.8270, 0.8269, 0.8275, , 0.7463, 0.7446, 0.7453], + [0.8287, 0.8278, 0.8283, , 0.7451, 0.7448, 0.7457], + [0.8280, 0.8273, 0.8269, , 0.7447, 0.7446, 0.7452], + , + [0.5923, 0.5933, 0.5924, , 0.0697, 0.0695, 0.0706], + [0.5926, 0.5932, 0.5926, , 0.0673, 0.0687, 0.0705], + [0.5927, 0.5914, 0.5922, , 0.0664, 0.0694, 0.0718]]]], + device='cuda:0'), hidden_states=None, attentions=None) +We need to get the reconstruction and post-process it for visualization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_18.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9b2159b561669cf63d36a392cb076f37292084 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_18.txt @@ -0,0 +1 @@ +Let's see how it looks like. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_19.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bed0c623c3d8c26524319398163d4d678ca0bf36 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_19.txt @@ -0,0 +1,5 @@ +thon +outputs.reconstruction.data.shape +torch.Size([1, 3, 880, 1072]) + +We need to squeeze the output and get rid of axis 0, clip the values, then convert it to be numpy float. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_20.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a479d804b3590df8a782388b5e6a33d34049e602 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_20.txt @@ -0,0 +1 @@ +Then we will arrange axes to have the shape [1072, 880], and finally, bring the output back to range [0, 255]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_21.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8410fc210bad3d16e6c92d0a8c532ba380e5587 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_21.txt @@ -0,0 +1,9 @@ +thon +import numpy as np +squeeze, take to CPU and clip the values +output = outputs.reconstruction.data.squeeze().cpu().clamp_(0, 1).numpy() +rearrange the axes +output = np.moveaxis(output, source=0, destination=-1) +bring values back to pixel values range +output = (output * 255.0).round().astype(np.uint8) +Image.fromarray(output) \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_image_to_image/chunk_9.txt b/chunked/content_aware_chunking/tasks_image_to_image/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b19b192a305582b6d240d3e0a6262fa4e1d8e638 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_image_to_image/chunk_9.txt @@ -0,0 +1,10 @@ +thon +from PIL import Image +import requests +url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg" +image = Image.open(requests.get(url, stream=True).raw) +print(image.size) +bash +(532, 432) + +We can now do inference with the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_13.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f8f7e3d58fcec0af5b71d486be3dc59ce6dcaad --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_13.txt @@ -0,0 +1 @@ +We will use the map() method of dataset to apply the preprocessing to every split of the dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_14.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb6320a568776dc827f1aa04cb374effe52822c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_14.txt @@ -0,0 +1,9 @@ +thon +from transformers import AutoImageProcessor +teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224") +def process(examples): + processed_inputs = teacher_processor(examples["image"]) + return processed_inputs +processed_datasets = dataset.map(process, batched=True) + +Essentially, we want the student model (a randomly initialized MobileNet) to mimic the teacher model (fine-tuned vision transformer). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_15.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7436824eb391c629a1dbab215ea5e784673a4a09 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_15.txt @@ -0,0 +1 @@ +To achieve this, we first get the logits output from the teacher and the student. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_16.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d096aa1d46fff867b95cd330f2c1e6237bafdb7e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_16.txt @@ -0,0 +1 @@ +Then, we divide each of them by the parameter temperature which controls the importance of each soft target. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_17.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b2f28db7f6878f01a6ca25931ed83629a28901a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_17.txt @@ -0,0 +1 @@ +A parameter called lambda weighs the importance of the distillation loss. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_18.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..de0b790f11f1f5384ab7993c8a105686cfe7cab1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_18.txt @@ -0,0 +1 @@ +In this example, we will use temperature=5 and lambda=0.5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_19.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b7b8d2b6a58c4f3ced102880c1be1d9e6c70a5f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_19.txt @@ -0,0 +1 @@ +We will use the Kullback-Leibler Divergence loss to compute the divergence between the student and teacher. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_20.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..60fc344e9c4b77886c6afe3c47cbace4775ead48 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_20.txt @@ -0,0 +1 @@ +Given two data P and Q, KL Divergence explains how much extra information we need to represent P using Q. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_21.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c6b10ecc1ea040582c8150d3e6ebdaec99a0aff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_21.txt @@ -0,0 +1 @@ +If two are identical, their KL divergence is zero, as there's no other information needed to explain P from Q. 
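Written out, with student logits $z_s$, teacher logits $z_t$, temperature $T$ and weight $\lambda$ (this simply restates what the compute_loss method below implements):

$$D_{\mathrm{KL}}(P \parallel Q) = \sum_i p_i \log \frac{p_i}{q_i}$$

$$\mathcal{L} = (1 - \lambda)\,\mathcal{L}_{\mathrm{CE}} + \lambda\, T^{2}\, D_{\mathrm{KL}}\big(\mathrm{softmax}(z_t / T) \parallel \mathrm{softmax}(z_s / T)\big)$$

Here $\mathcal{L}_{\mathrm{CE}}$ is the student's ordinary cross-entropy loss on the true labels, and the $T^{2}$ factor compensates for the scaling introduced by the softened softmax.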
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_22.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..4943d0800991b640ff3fd4636e72ded83f80fd05 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_22.txt @@ -0,0 +1 @@ +Thus, in the context of knowledge distillation, KL divergence is useful. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_23.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..a563ed930dfe81710b7728e939ac06698052a8cb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_23.txt @@ -0,0 +1,34 @@ +thon +from transformers import TrainingArguments, Trainer +import torch +import torch.nn as nn +import torch.nn.functional as F +class ImageDistilTrainer(Trainer): + def init(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, args, kwargs): + super().init(model=student_model, args, **kwargs) + self.teacher = teacher_model + self.student = student_model + self.loss_function = nn.KLDivLoss(reduction="batchmean") + device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') + self.teacher.to(device) + self.teacher.eval() + self.temperature = temperature + self.lambda_param = lambda_param +def compute_loss(self, student, inputs, return_outputs=False): + student_output = self.student(**inputs) + + with torch.no_grad(): + teacher_output = self.teacher(**inputs) + + # Compute soft targets for teacher and student + soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1) + soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1) + + # Compute the loss + distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2) + + # Compute the true label loss + student_target_loss = student_output.loss + + # Calculate final loss + loss = (1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_24.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..10f63f17d778bebe346ed2ee3521ca1d23d898a9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_24.txt @@ -0,0 +1,4 @@ +- self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss + return (loss, student_output) if return_outputs else loss + +We will now login to Hugging Face Hub so we can push our model to the Hugging Face Hub through the Trainer. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_25.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..21b370eb8b143cc938014a09089f4b035f4176fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_25.txt @@ -0,0 +1,5 @@ +python +from huggingface_hub import notebook_login +notebook_login() + +Let's set the TrainingArguments, the teacher model and the student model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_26.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c665ce42a98562091742bd500ae8fc7c66c53c3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_26.txt @@ -0,0 +1,30 @@ +python +from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification +training_args = TrainingArguments( + output_dir="my-awesome-model", + num_train_epochs=30, + fp16=True, + logging_dir=f"{repo_name}/logs", + logging_strategy="epoch", + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + metric_for_best_model="accuracy", + report_to="tensorboard", + push_to_hub=True, + hub_strategy="every_save", + hub_model_id=repo_name, + ) +num_labels = len(processed_datasets["train"].features["labels"].names) +# initialize models +teacher_model = AutoModelForImageClassification.from_pretrained( + "merve/beans-vit-224", + num_labels=num_labels, + ignore_mismatched_sizes=True +) +# training MobileNetV2 from scratch +student_config = MobileNetV2Config() +student_config.num_labels = num_labels +student_model = MobileNetV2ForImageClassification(student_config) + +We can use a compute_metrics function to evaluate our model on the test set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..eeef4b61d8c82a0f138a1735885972405cfef0c2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_27.txt @@ -0,0 +1 @@ +This function will be used during the training process to compute the accuracy and F1 score of our model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..88be0e2bee4d5cc32b19c4d7d6017f3f8eb679fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_28.txt @@ -0,0 +1,10 @@ +python +import evaluate +import numpy as np +accuracy = evaluate.load("accuracy") +def compute_metrics(eval_pred): + predictions, labels = eval_pred + acc = accuracy.compute(references=labels, predictions=np.argmax(predictions, axis=1)) + return {"accuracy": acc["accuracy"]} + +Let's initialize the Trainer with the training arguments we defined.
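The prose above mentions accuracy and F1, while the compute_metrics shown only returns accuracy. If F1 should also be reported, one possible extension is sketched below; the use of the evaluate library's "f1" metric and the macro-averaging choice are assumptions, not part of the original guide:

import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")  # assumed metric choice

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    acc = accuracy.compute(references=labels, predictions=preds)
    # macro average treats all classes equally (averaging strategy is an assumption)
    f1_score = f1.compute(references=labels, predictions=preds, average="macro")
    return {"accuracy": acc["accuracy"], "f1": f1_score["f1"]}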
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_29.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0219d10ea25b6e925842ca8a6b67834e20aac6be --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_29.txt @@ -0,0 +1 @@ +We will also initialize our data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_30.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab73f446fc58326664bffd79796e841aa49b4b77 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_30.txt @@ -0,0 +1,17 @@ +thon +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator() +trainer = ImageDistilTrainer( + student_model=student_model, + teacher_model=teacher_model, + training_args=training_args, + train_dataset=processed_datasets["train"], + eval_dataset=processed_datasets["validation"], + data_collator=data_collator, + tokenizer=teacher_processor, + compute_metrics=compute_metrics, + temperature=5, + lambda_param=0.5 +) + +We can now train our model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_31.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..618f3274916debdf020350e4327faed55eca3215 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_31.txt @@ -0,0 +1,3 @@ +python +trainer.train() +We can evaluate the model on the test set. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_32.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bc29f0d5a478ceff7b7c532d559c86c67d26b87 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_32.txt @@ -0,0 +1,3 @@ +python +trainer.evaluate(processed_datasets["test"]) +On test set, our model reaches 72 percent accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_33.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7b74b1cb722fc39bef372d66d9056e2be86f456 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_33.txt @@ -0,0 +1 @@ +To have a sanity check over efficiency of distillation, we also trained MobileNet on the beans dataset from scratch with the same hyperparameters and observed 63 percent accuracy on the test set. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_34.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..df6625654f18e078e47faec9d1cdcd4dde829f34 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_34.txt @@ -0,0 +1 @@ +We invite the readers to try different pre-trained teacher models, student architectures, distillation parameters and report their findings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_35.txt b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2755c5818c501dee4c5a447aaaba051fc3686d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_knowledge_distillation_for_image_classification/chunk_35.txt @@ -0,0 +1 @@ +The training logs and checkpoints for distilled model can be found in this repository, and MobileNetV2 trained from scratch can be found in this repository. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_26.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cb1f7bc79e431ad57e87b80eec7bcdc736deca5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_26.txt @@ -0,0 +1 @@ +It has to be reconciled with a vastly different house bill and then passed again. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_27.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..54de386ba1eed8f5718b4e6928bafd18251e5d2e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_27.txt @@ -0,0 +1,2 @@ +', + 'Also: does this apply to 2017 taxes? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_28.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..26c67fdd01031dad9dcb85004c4a80b47cadd9a9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_28.txt @@ -0,0 +1 @@ +Or does it start with 2018 taxes? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_29.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f92792bceca738fcf9e128982926ec3e34d67ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_29.txt @@ -0,0 +1,2 @@ +', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_30.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..22894f3d4d59db155ad81ce6898d71ee2b94b777 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_30.txt @@ -0,0 +1,10 @@ +URL_0'], + 'answers.score': [21, 19, 5, 3], + 'answers.text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']], + 'title_urls': ['url'], + 'selftext_urls': ['url']} + +Each subfield is now a separate column as indicated by the answers prefix, and the text field is a list now. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_31.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb783564d5730e5f137a06b44fb36bb2d4598331 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_31.txt @@ -0,0 +1,2 @@ +Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_32.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..68bab97b4b20a3d5500bdc4f62cf416e695b01d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_32.txt @@ -0,0 +1,6 @@ +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +def preprocess_function(examples): + return tokenizer([" ".join(x) for x in examples["answers.text"]]) + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [~datasets.Dataset.map] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_33.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4822b855ad8b8c1d8407956723ccef02d018e729 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_33.txt @@ -0,0 +1 @@ +You can speed up the map function by setting batched=True to process multiple elements of the dataset at once, and increasing the number of processes with num_proc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_34.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f8455941a2ec66e1c2c003a5aae481ba7e4a9b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_34.txt @@ -0,0 +1,10 @@ +Remove any columns you don't need: + +tokenized_eli5 = eli5.map( + preprocess_function, + batched=True, + num_proc=4, + remove_columns=eli5["train"].column_names, + ) + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_35.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0f0379b244698d11406908e4149c266ff1dc2f9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_35.txt @@ -0,0 +1,4 @@ +You can now use a second preprocessing function to + +concatenate all the sequences +split the concatenated sequences into shorter chunks defined by block_size, which should be both shorter than the maximum input length and short enough for your GPU RAM. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_36.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa808b8cbd68cce7df99fd606fab83a97b71277f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_36.txt @@ -0,0 +1,3 @@ +block_size = 128 +def group_texts(examples): + # Concatenate all texts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_37.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..65d30289184e0a0e5e2787b15ffd4e8e7c4c3625 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_37.txt @@ -0,0 +1,4 @@ +concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_38.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..81c2774053c75a3e28c8aa8a9496def1b8aa7e9d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_38.txt @@ -0,0 +1,3 @@ +if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of block_size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_39.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff6ebaf1cb5cdcc7e6f006a617d57347a2084aaa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_39.txt @@ -0,0 +1,12 @@ +result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + result["labels"] = result["input_ids"].copy() + return result + +Apply the group_texts function over the entire dataset: + +lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) + +Now create a batch of examples using [DataCollatorForLanguageModeling]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_40.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4262d840fed45f68dfd8d032ef96dbc90c2101e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_40.txt @@ -0,0 +1,2 @@ +It's more efficient to dynamically pad the +sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_41.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f99d0848b26478f56e4eb3c012fbefd2a63c9e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_41.txt @@ -0,0 +1 @@ +Use the end-of-sequence token as the padding token and set mlm=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_42.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..a47d3b648df0d8a0f5aa64b0a0f69f8e2bb7b5fa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_42.txt @@ -0,0 +1,7 @@ +This will use the inputs as labels shifted to the right by one element: + +from transformers import DataCollatorForLanguageModeling +tokenizer.pad_token = tokenizer.eos_token +data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) + +Use the end-of-sequence token as the padding token and set mlm=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_43.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d85a780c9f77800dac70d87225ccb52b35cd829 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_43.txt @@ -0,0 +1,8 @@ +This will use the inputs as labels shifted to the right by one element: + +from transformers import DataCollatorForLanguageModeling +data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf") + +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_44.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_44.txt @@ -0,0 +1 @@ +You're ready to start training your model now! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_45.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..3779ffa6ff6539ca1e647ebc6a4283c47e7fda28 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_45.txt @@ -0,0 +1,8 @@ +Load DistilGPT2 with [AutoModelForCausalLM]: + +from transformers import AutoModelForCausalLM, TrainingArguments, Trainer +model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_46.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_46.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_47.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_47.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_48.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..30319a39626a65879f6dbe581d6130a6d3822707 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_48.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, datasets, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_49.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_49.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_50.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9d3a05ee0802277a3bb3fc9193a6a9bcdb8dd7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_50.txt @@ -0,0 +1,28 @@ +training_args = TrainingArguments( + output_dir="my_awesome_eli5_clm-model", + evaluation_strategy="epoch", + learning_rate=2e-5, + weight_decay=0.01, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=lm_dataset["train"], + eval_dataset=lm_dataset["test"], + data_collator=data_collator, + ) +trainer.train() + +Once training is completed, use the [~transformers.Trainer.evaluate] method to evaluate your model and get its perplexity: + +import math +eval_results = trainer.evaluate() +print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 49.61 + +Then share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_51.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..9103fc1e26414848d15cc9f49c620438c7de62f5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_51.txt @@ -0,0 +1,26 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer, AdamWeightDecay +optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) + +Then you can load DistilGPT2 with [TFAutoModelForCausalLM]: + +from transformers import TFAutoModelForCausalLM +model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + lm_dataset["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_test_set = model.prepare_tf_dataset( + lm_dataset["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_52.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_52.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_53.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..2412a6b0eb2d30f91c3c1001ddecb53590e7ea47 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_53.txt @@ -0,0 +1,9 @@ +This can be done by specifying where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +callback = PushToHubCallback( + output_dir="my_awesome_eli5_clm-model", + tokenizer=tokenizer, + ) + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_54.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0fc51dbe0099df22455879ebf4807adfceacba6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_54.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_55.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca7f742c12c25fe9dd5901b922aed047686ea4e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_55.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for causal language modeling, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_56.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_56.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_57.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..517540a988cec5c4511386f9a1893e004205a4e9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_57.txt @@ -0,0 +1,5 @@ +Come up with a prompt you'd like to generate text from: + +prompt = "Somatic hypermutation allows the immune system to" + +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_58.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b1796eefa668212da5ce5d04460c8a699b128d2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_58.txt @@ -0,0 +1,6 @@ +Instantiate a pipeline for text generation with your model, and pass your text to it: + +from transformers import pipeline +generator = pipeline("text-generation", model="username/my_awesome_eli5_clm-model") +generator(prompt) +[{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_59.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..dddea5ed0294d9e7ba6a455f1b9f113ae5bcda3d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_59.txt @@ -0,0 +1,7 @@ +Tokenize the text and return the input_ids as PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model") +inputs = tokenizer(prompt, return_tensors="pt").input_ids + +Use the [~transformers.generation_utils.GenerationMixin.generate] method to generate text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_60.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..10d0e5f67e1e7c5266c63e2fb6c8cec70500f6b9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_60.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text generation strategies page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_61.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..99dfdd3cca1bf2c72ff4be057a7a3d97d049483b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_61.txt @@ -0,0 +1,8 @@ +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model") +outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) + +Decode the generated token ids back into text: + +tokenizer.batch_decode(outputs, skip_special_tokens=True) +["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_62.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e37ea5a9ebe70cbc428b40f408a9e73e2272e904 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_62.txt @@ -0,0 +1 @@ +In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_63.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..55f4c7cebc8a63bc7b12a5320f2925309a97673f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_63.txt @@ -0,0 +1 @@ +In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_64.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..6daadb5651cb7947b23c0f0a5f295a38c14adb9f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_64.txt @@ -0,0 +1,11 @@ +A study on how the immune system"] +`` + + +Tokenize the text and return theinput_ids` as TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model") +inputs = tokenizer(prompt, return_tensors="tf").input_ids + +Use the [~transformers.generation_tf_utils.TFGenerationMixin.generate] method to create the summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_65.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..10d0e5f67e1e7c5266c63e2fb6c8cec70500f6b9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_65.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text generation strategies page. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_66.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..c585914de3646a8bc55582c35c12154b9ea537ca --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_66.txt @@ -0,0 +1,8 @@ +from transformers import TFAutoModelForCausalLM +model = TFAutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model") +outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95) + +Decode the generated token ids back into text: + +tokenizer.batch_decode(outputs, skip_special_tokens=True) +['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_67.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b441725acf740d0b6fda5da47d1c0d548d1a1a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_67.txt @@ -0,0 +1 @@ +Therefore, researchers have identified a high proportion of human viruses. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_68.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd5f6752eac54d3ed9fc3c7dc1071be8b7015640 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_68.txt @@ -0,0 +1 @@ +The proportion of virus-associated viruses in our study increases with age. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_69.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb6352cd714aff62bdf43ed369e32b40359bbb5b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_69.txt @@ -0,0 +1 @@ +Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_language_modeling/chunk_70.txt b/chunked/content_aware_chunking/tasks_language_modeling/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3fe7f1fff6a9df742b57ae33c7c1e42f68b2b94 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_language_modeling/chunk_70.txt @@ -0,0 +1 @@ +A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_13.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..65edb65a9fb2eff7e1c5dd51d2234fc689a40f14 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_13.txt @@ -0,0 +1,4 @@ +In this guide, you will learn how to: +- Infer in segment everything mode with batching, +- Infer in point prompting mode, +- Infer in box prompting mode. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_14.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c4bac5ced57b0c04ac88d2d5aa4478d2529240b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_14.txt @@ -0,0 +1,5 @@ +First, let's install transformers: + +pip install -q transformers +Mask Generation Pipeline +The easiest way to infer mask generation models is to use the mask-generation pipeline. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_15.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b9d529290f4b1d0fcd24f615ef7c20536b59979 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_15.txt @@ -0,0 +1,7 @@ +thon + +from transformers import pipeline +checkpoint = "facebook/sam-vit-base" +mask_generator = pipeline(model=checkpoint, task="mask-generation") + +Let's see the image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_16.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f93f05e3654c52a0d19c7d5146ed17748fe9fe36 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_16.txt @@ -0,0 +1,7 @@ +thon +from PIL import Image +import requests +img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" +image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") + +Let's segment everything. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_17.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6d32846b398e79e5afdc565f05ecc493c5feb7d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_17.txt @@ -0,0 +1 @@ +points-per-batch enables parallel inference of points in segment everything mode. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_18.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..89fb7b26bc2690e3b5e9706751b5d09ec0c73912 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_18.txt @@ -0,0 +1 @@ +This enables faster inference, but consumes more memory. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_19.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..478097d4dab1da8ccf82173f9fc20eb5f45de25d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_19.txt @@ -0,0 +1 @@ +Moreover, SAM only enables batching over points and not the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_20.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..092d2db56312ed34a513b95d9632c48409dbe5d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_20.txt @@ -0,0 +1 @@ +pred_iou_thresh is the IoU confidence threshold where only the masks above that certain threshold are returned. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_21.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..271e4179652afd2d7121ac49b5e60336b79e538c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_21.txt @@ -0,0 +1,28 @@ +python +masks = mask_generator(image, points_per_batch=128, pred_iou_thresh=0.88) +The masks looks like the following: + +{'masks': [array([[False, False, False, , True, True, True], + [False, False, False, , True, True, True], + [False, False, False, , True, True, True], + , + [False, False, False, , False, False, False], + [False, False, False, , False, False, False], + [False, False, False, , False, False, False]]), + array([[False, False, False, , False, False, False], + [False, False, False, , False, False, False], + [False, False, False, , False, False, False], + , +'scores': tensor([0.9972, 0.9917, + , +} +We can visualize them like this: +thon +import matplotlib.pyplot as plt +plt.imshow(image, cmap='gray') +for i, mask in enumerate(masks["masks"]): + plt.imshow(mask, cmap='viridis', alpha=0.1, vmin=0, vmax=1) +plt.axis('off') +plt.show() + +Below is the original image in grayscale with colorful maps overlaid. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_22.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc09862d330bac75a3ac1c7cdefed55a91b748fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_22.txt @@ -0,0 +1 @@ +Very impressive. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_23.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..f848ba3c0e98b10185859ade841e42c7f886c61b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_23.txt @@ -0,0 +1,3 @@ +Model Inference +Point Prompting +You can also use the model without the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_24.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad8e0dfd7476bc83abff859bb146e5564e76976f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_24.txt @@ -0,0 +1,2 @@ +To do so, initialize the model and +the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_25.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b15ed65dccae8e2bf050b59a972f54eab4aa22c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_25.txt @@ -0,0 +1,8 @@ +thon +from transformers import SamModel, SamProcessor +device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') +model = SamModel.from_pretrained("facebook/sam-vit-base").to(device) +processor = SamProcessor.from_pretrained("facebook/sam-vit-base") + +To do point prompting, pass the input point to the processor, then take the processor output +and pass it to the model for inference. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_26.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f20f79686dbd177ea29753239603482d761a69d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_26.txt @@ -0,0 +1,2 @@ +To post-process the model output, pass the outputs along with the +original_sizes and reshaped_input_sizes we take from the processor's initial output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_27.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..61a88c71316e09e60b571a7675de63c2f5f93fa3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_27.txt @@ -0,0 +1,2 @@ +We need to pass these +since the processor resizes the image, and the output needs to be extrapolated back to the original image size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_28.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cb70e045861f7e1c4ab382380ce4918e77f155f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_28.txt @@ -0,0 +1,8 @@ +python +input_points = [[[2592, 1728]]] # point location of the bee +inputs = processor(image, input_points=input_points, return_tensors="pt").to(device) +with torch.no_grad(): + outputs = model(**inputs) +masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()) + +We can visualize the three masks in the masks output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_29.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..db58a5f63c309f8d6961e8b2a8f88336464995d9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_29.txt @@ -0,0 +1,23 @@ +python +import torch +import matplotlib.pyplot as plt +import numpy as np +fig, axes = plt.subplots(1, 4, figsize=(15, 5)) +axes[0].imshow(image) +axes[0].set_title('Original Image') +mask_list = [masks[0][0][0].numpy(), masks[0][0][1].numpy(), masks[0][0][2].numpy()] +for i, mask in enumerate(mask_list, start=1): + overlayed_image = np.array(image).copy() + overlayed_image[:,:,0] = np.where(mask == 1, 255, overlayed_image[:,:,0]) + overlayed_image[:,:,1] = np.where(mask == 1, 0, overlayed_image[:,:,1]) + overlayed_image[:,:,2] = np.where(mask == 1, 0, overlayed_image[:,:,2]) + + axes[i].imshow(overlayed_image) + axes[i].set_title(f'Mask {i}') + +for ax in axes: + ax.axis('off') +plt.show() + +Box Prompting +You can also do box prompting in a similar fashion to point prompting.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_31.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..22448c460464cb5ccfeda82a464d952587837f0f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_31.txt @@ -0,0 +1,2 @@ +Take the processor output and directly pass it +to the model, then post-process the output again. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_32.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..06cd44bdb6e024153390f8540da0d0397485a3f7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_32.txt @@ -0,0 +1,17 @@ +python +# bounding box around the bee +box = [2350, 1600, 2850, 2100] +inputs = processor( + image, + input_boxes=[[[box]]], + return_tensors="pt" + ).to("cuda") +with torch.no_grad(): + outputs = model(**inputs) +mask = processor.image_processor.post_process_masks( + outputs.pred_masks.cpu(), + inputs["original_sizes"].cpu(), + inputs["reshaped_input_sizes"].cpu() +)[0][0][0].numpy() + +You can visualize the bounding box around the bee as shown below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_33.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..db67fb8025ebfe23dbe11c601c0123087f79e16a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_33.txt @@ -0,0 +1,10 @@ +python +import matplotlib.patches as patches +fig, ax = plt.subplots() +ax.imshow(image) +rectangle = patches.Rectangle((2350, 1600), 500, 500, linewidth=2, edgecolor='r', facecolor='none') +ax.add_patch(rectangle) +ax.axis("off") +plt.show() + +You can see the inference output below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_mask_generation/chunk_34.txt b/chunked/content_aware_chunking/tasks_mask_generation/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..a468d2356ec6f31b3c9e805271d017bc1b8c2b92 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_mask_generation/chunk_34.txt @@ -0,0 +1,6 @@ +python +fig, ax = plt.subplots() +ax.imshow(image) +ax.imshow(mask, cmap='viridis', alpha=0.4) +ax.axis("off") +plt.show() \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_24.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..404571eb0a19ad023364df589eaa4ea1f41c3d03 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_24.txt @@ -0,0 +1 @@ +- Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_25.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a46720f027a6ad6aafaa6362719e2dded067032 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_25.txt @@ -0,0 +1,2 @@ +", + 'None yet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_26.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cb1f7bc79e431ad57e87b80eec7bcdc736deca5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_26.txt @@ -0,0 +1 @@ +It has to be reconciled with a vastly different house bill and then passed again. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_27.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..54de386ba1eed8f5718b4e6928bafd18251e5d2e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_27.txt @@ -0,0 +1,2 @@ +', + 'Also: does this apply to 2017 taxes? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_28.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..26c67fdd01031dad9dcb85004c4a80b47cadd9a9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_28.txt @@ -0,0 +1 @@ +Or does it start with 2018 taxes? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_29.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f92792bceca738fcf9e128982926ec3e34d67ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_29.txt @@ -0,0 +1,2 @@ +', + 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_30.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..22894f3d4d59db155ad81ce6898d71ee2b94b777 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_30.txt @@ -0,0 +1,10 @@ +URL_0'], + 'answers.score': [21, 19, 5, 3], + 'answers.text_urls': [[], + [], + [], + ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']], + 'title_urls': ['url'], + 'selftext_urls': ['url']} + +Each subfield is now a separate column as indicated by the answers prefix, and the text field is a list now. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_31.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb783564d5730e5f137a06b44fb36bb2d4598331 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_31.txt @@ -0,0 +1,2 @@ +Instead +of tokenizing each sentence separately, convert the list to a string so you can jointly tokenize them. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_32.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..68bab97b4b20a3d5500bdc4f62cf416e695b01d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_32.txt @@ -0,0 +1,6 @@ +Here is a first preprocessing function to join the list of strings for each example and tokenize the result: + +def preprocess_function(examples): + return tokenizer([" ".join(x) for x in examples["answers.text"]]) + +To apply this preprocessing function over the entire dataset, use the 🤗 Datasets [~datasets.Dataset.map] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_33.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4822b855ad8b8c1d8407956723ccef02d018e729 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_33.txt @@ -0,0 +1 @@ +You can speed up the map function by setting batched=True to process multiple elements of the dataset at once, and increasing the number of processes with num_proc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_34.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0f8455941a2ec66e1c2c003a5aae481ba7e4a9b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_34.txt @@ -0,0 +1,10 @@ +Remove any columns you don't need: + +tokenized_eli5 = eli5.map( + preprocess_function, + batched=True, + num_proc=4, + remove_columns=eli5["train"].column_names, + ) + +This dataset contains the token sequences, but some of these are longer than the maximum input length for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_35.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c25fbe1f6dd8a7571b08bd17ce2133dbfb6a09b2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_35.txt @@ -0,0 +1,3 @@ +You can now use a second preprocessing function to +- concatenate all the sequences +- split the concatenated sequences into shorter chunks defined by block_size, which should be both shorter than the maximum input length and short enough for your GPU RAM. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_36.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa808b8cbd68cce7df99fd606fab83a97b71277f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_36.txt @@ -0,0 +1,3 @@ +block_size = 128 +def group_texts(examples): + # Concatenate all texts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_37.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..65d30289184e0a0e5e2787b15ffd4e8e7c4c3625 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_37.txt @@ -0,0 +1,4 @@ +concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()} + total_length = len(concatenated_examples[list(examples.keys())[0]]) + # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can + # customize this part to your needs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_38.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..81c2774053c75a3e28c8aa8a9496def1b8aa7e9d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_38.txt @@ -0,0 +1,3 @@ +if total_length >= block_size: + total_length = (total_length // block_size) * block_size + # Split by chunks of block_size. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_39.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a4032cc9b65187eaca4b26286708c1025202767 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_39.txt @@ -0,0 +1,11 @@ +result = { + k: [t[i : i + block_size] for i in range(0, total_length, block_size)] + for k, t in concatenated_examples.items() + } + return result + +Apply the group_texts function over the entire dataset: + +lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4) + +Now create a batch of examples using [DataCollatorForLanguageModeling]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_40.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fa2f3dc3281d831e79faf4697f0fba60bd5c93d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_40.txt @@ -0,0 +1 @@ +It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_41.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..95b872e9102bdbcc78aab8e75bb58b0cc023e123 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_41.txt @@ -0,0 +1,14 @@ +Use the end-of-sequence token as the padding token and specify mlm_probability to randomly mask tokens each time you iterate over the data: + +from transformers import DataCollatorForLanguageModeling +tokenizer.pad_token = tokenizer.eos_token +data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15) + +Use the end-of-sequence token as the padding token and specify mlm_probability to randomly mask tokens each time you iterate over the data: + +from transformers import DataCollatorForLanguageModeling +data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf") + +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_42.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_42.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_43.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..2814760f2619b004ff414cf97ea5ca1d82afc1cc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_43.txt @@ -0,0 +1,8 @@ +Load DistilRoBERTa with [AutoModelForMaskedLM]: + +from transformers import AutoModelForMaskedLM +model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base") + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_44.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_44.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_45.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_45.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). 
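Because push_to_hub=True uploads checkpoints while training runs, you need an authenticated Hugging Face session first. A minimal sketch of signing in, assuming you already have an access token from your account settings:

# In a notebook, this prompts for your Hugging Face access token.
from huggingface_hub import notebook_login
notebook_login()
# From a shell, huggingface-cli login does the same thing.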
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_46.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..30319a39626a65879f6dbe581d6130a6d3822707 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_46.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, datasets, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_47.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_47.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_48.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ef4f3cd6d5b32993da20bc4cf26638196ce6648 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_48.txt @@ -0,0 +1,29 @@ +training_args = TrainingArguments( + output_dir="my_awesome_eli5_mlm_model", + evaluation_strategy="epoch", + learning_rate=2e-5, + num_train_epochs=3, + weight_decay=0.01, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=lm_dataset["train"], + eval_dataset=lm_dataset["test"], + data_collator=data_collator, + ) +trainer.train() + +Once training is completed, use the [~transformers.Trainer.evaluate] method to evaluate your model and get its perplexity: + +import math +eval_results = trainer.evaluate() +print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}") +Perplexity: 8.76 + +Then share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_49.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..4060c8082e316229320c5c812d79d3a659dde159 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_49.txt @@ -0,0 +1,26 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer, AdamWeightDecay +optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) + +Then you can load DistilRoBERTa with [TFAutoModelForMaskedLM]: + +from transformers import TFAutoModelForMaskedLM +model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base") + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + lm_dataset["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_test_set = model.prepare_tf_dataset( + lm_dataset["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_50.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_50.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_51.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..84b66f78ec5d8a84d6c4045b8fc2c72087bf56dd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_51.txt @@ -0,0 +1,9 @@ +This can be done by specifying where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +callback = PushToHubCallback( + output_dir="my_awesome_eli5_mlm_model", + tokenizer=tokenizer, + ) + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_52.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0fc51dbe0099df22455879ebf4807adfceacba6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_52.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback]) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_53.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..28e56fcc45c1669423cfbe9f410c0c28abb3ca4c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_53.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for masked language modeling, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_54.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_54.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_55.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..adc6f96576692f885e6da7be9e0b2b763acfd4fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_55.txt @@ -0,0 +1,3 @@ +Come up with some text you'd like the model to fill in the blank with, and use the special <mask> token to indicate the blank: + +text = "The Milky Way is a <mask> galaxy." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_56.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_56.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_57.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..1591b7128c317b93e5463cd5a8c064f1516dc5c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_57.txt @@ -0,0 +1 @@ +Instantiate a pipeline for fill-mask with your model, and pass your text to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_58.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..5da79367d4b53435bec934d546f47eac369ce175 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_58.txt @@ -0,0 +1,9 @@ +If you like, you can use the top_k parameter to specify how many predictions to return: + +from transformers import pipeline +mask_filler = pipeline("fill-mask", "username/my_awesome_eli5_mlm_model") +mask_filler(text, top_k=3) +[{'score': 0.5150994658470154, + 'token': 21300, + 'token_str': ' spiral', + 'sequence': 'The Milky Way is a spiral galaxy. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_59.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebcb6a4af53831b45ef2f400738628c7cf38d6bf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_59.txt @@ -0,0 +1,5 @@ +'}, + {'score': 0.07087188959121704, + 'token': 2232, + 'token_str': ' massive', + 'sequence': 'The Milky Way is a massive galaxy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_60.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ebd9c42ab17a8b7c497a5d6fcaf7f690a5ce76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_60.txt @@ -0,0 +1,5 @@ +'}, + {'score': 0.06434620916843414, + 'token': 650, + 'token_str': ' small', + 'sequence': 'The Milky Way is a small galaxy.'}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_61.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e6eb5c0956013de64672a5b6f11c1198bba9ac --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_61.txt @@ -0,0 +1 @@ +Tokenize the text and return the input_ids as PyTorch tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_62.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1d25fc02eb6934f49a5f0ace01ae4b0b11e8d48 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_62.txt @@ -0,0 +1,20 @@ +You'll also need to specify the position of the <mask> token: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model") +inputs = tokenizer(text, return_tensors="pt") +mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1] + +Pass your inputs to the model and return the logits of the masked token: + +from transformers import AutoModelForMaskedLM +model = AutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model") +logits = model(**inputs).logits +mask_token_logits = logits[0, mask_token_index, :] + +Then return the three masked tokens with the highest probability and print them out: + +top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist() +for token in top_3_tokens: + print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_63.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..7edd970f16c7276c415860d788bffbbf668a74de --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_63.txt @@ -0,0 +1 @@ +The Milky Way is a massive galaxy. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_64.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..3847bf09e8770a4852db26dad5a22482f5131a22 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_64.txt @@ -0,0 +1 @@ +The Milky Way is a small galaxy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_65.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe8ac570ee9caa8001ac1535cbb060419fe3cc10 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_65.txt @@ -0,0 +1 @@ +Tokenize the text and return the input_ids as TensorFlow tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_66.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..c69cd84519c3f4343193a33325edec946a4dc305 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_66.txt @@ -0,0 +1,20 @@ +You'll also need to specify the position of the <mask> token: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model") +inputs = tokenizer(text, return_tensors="tf") +mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1] + +Pass your inputs to the model and return the logits of the masked token: + +from transformers import TFAutoModelForMaskedLM +model = TFAutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model") +logits = model(**inputs).logits +mask_token_logits = logits[0, mask_token_index, :] + +Then return the three masked tokens with the highest probability and print them out: + +top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy() +for token in top_3_tokens: + print(text.replace(tokenizer.mask_token, tokenizer.decode([token]))) +The Milky Way is a spiral galaxy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_67.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..7edd970f16c7276c415860d788bffbbf668a74de --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_67.txt @@ -0,0 +1 @@ +The Milky Way is a massive galaxy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_68.txt b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..3847bf09e8770a4852db26dad5a22482f5131a22 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_masked_language_modeling/chunk_68.txt @@ -0,0 +1 @@ +The Milky Way is a small galaxy. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_3.txt b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..e77822330443f99dc183a266591a982fab6d2857 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_3.txt @@ -0,0 +1,2 @@ +The first one, called predicted_depth, is a tensor with the values +being the depth expressed in meters for each pixel. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_4.txt b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..2719d02900279e22ee3ac0b7bd5b940777017e35 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_4.txt @@ -0,0 +1 @@ +The second one, depth, is a PIL image that visualizes the depth estimation result. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_5.txt b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..a757c4cb25a813685693c60d9d2526daa5c1c3ae --- /dev/null +++ b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_5.txt @@ -0,0 +1,6 @@ +Let's take a look at the visualized result: + +predictions["depth"] + +Depth estimation inference by hand +Now that you've seen how to use the depth estimation pipeline, let's see how we can replicate the same result by hand. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_6.txt b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d101059fad2e8f365be3ab979e2ddcd6db418ff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_6.txt @@ -0,0 +1 @@ +Start by loading the model and associated processor from a checkpoint on the Hugging Face Hub. 
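For reference before the by-hand walkthrough that follows, here is a minimal sketch of the pipeline call that produced the predicted_depth and depth outputs described above; the image variable is assumed from earlier in the guide, and the checkpoint matches the one used in the manual example below:

from transformers import pipeline

# Build a depth-estimation pipeline and run it on the input image.
depth_estimator = pipeline("depth-estimation", model="vinvino02/glpn-nyu")
predictions = depth_estimator(image)
predictions["predicted_depth"]  # tensor of per-pixel depth values
predictions["depth"]  # PIL image visualizing the depth map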
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_7.txt b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3b2bcc2400698546a1869804c863c5a5250266a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_monocular_depth_estimation/chunk_7.txt @@ -0,0 +1,33 @@ +Here we'll use the same checkpoint as before: + +from transformers import AutoImageProcessor, AutoModelForDepthEstimation +checkpoint = "vinvino02/glpn-nyu" +image_processor = AutoImageProcessor.from_pretrained(checkpoint) +model = AutoModelForDepthEstimation.from_pretrained(checkpoint) + +Prepare the image input for the model using the image_processor that will take care of the necessary image transformations +such as resizing and normalization: + +pixel_values = image_processor(image, return_tensors="pt").pixel_values + +Pass the prepared inputs through the model: + +import torch +with torch.no_grad(): + outputs = model(pixel_values) + predicted_depth = outputs.predicted_depth + +Visualize the results: + +import numpy as np +interpolate to original size +prediction = torch.nn.functional.interpolate( + predicted_depth.unsqueeze(1), + size=image.size[::-1], + mode="bicubic", + align_corners=False, + ).squeeze() +output = prediction.numpy() +formatted = (output * 255 / np.max(output)).astype("uint8") +depth = Image.fromarray(formatted) +depth \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_12.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..84e073e71451abad6d5411c215ade8456a11e2fe --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_12.txt @@ -0,0 +1 @@ +Flatten these two lists so you can tokenize them, and then unflatten them afterward so each example has a corresponding input_ids, attention_mask, and labels field. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_13.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..6559dc975a01f5f732c548cad805f7deb3c3931a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_13.txt @@ -0,0 +1,14 @@ +ending_names = ["ending0", "ending1", "ending2", "ending3"] +def preprocess_function(examples): + first_sentences = [[context] * 4 for context in examples["sent1"]] + question_headers = examples["sent2"] + second_sentences = [ + [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers) + ] + + first_sentences = sum(first_sentences, []) + second_sentences = sum(second_sentences, []) + tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True) + return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()} + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [~datasets.Dataset.map] method. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_14.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c96e9a226f60f176fdc3e4a0888e0b670400221 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_14.txt @@ -0,0 +1,4 @@ +You can speed up the map function by setting batched=True to process multiple elements of the dataset at once: +py +tokenized_swag = swag.map(preprocess_function, batched=True) +🤗 Transformers doesn't have a data collator for multiple choice, so you'll need to adapt the [DataCollatorWithPadding] to create a batch of examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_15.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fa2f3dc3281d831e79faf4697f0fba60bd5c93d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_15.txt @@ -0,0 +1 @@ +It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_16.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9133b2d4d9d6c36c648fbb6fb034a1879eda07a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_16.txt @@ -0,0 +1,11 @@ +DataCollatorForMultipleChoice flattens all the model inputs, applies padding, and then unflattens the results: + +from dataclasses import dataclass +from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +from typing import Optional, Union +import torch +@dataclass + class DataCollatorForMultipleChoice: + """ + Data collator that will dynamically pad the inputs for multiple choice received. 
+ """ \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_17.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f82cbc71799b1d8c31bbb1cd855e321d9d3389cc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_17.txt @@ -0,0 +1,35 @@ +tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + def __call__(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature.pop(label_name) for feature in features] + batch_size = len(features) + num_choices = len(features[0]["input_ids"]) + flattened_features = [ + [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features + ] + flattened_features = sum(flattened_features, []) + batch = self.tokenizer.pad( + flattened_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="pt", + ) + batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()} + batch["labels"] = torch.tensor(labels, dtype=torch.int64) + return batch + +py + +from dataclasses import dataclass +from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy +from typing import Optional, Union +import tensorflow as tf +@dataclass + class DataCollatorForMultipleChoice: + """ + Data collator that will dynamically pad the inputs for multiple choice received. + """ \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_18.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..036a50421ded35f9499fcc910447f4e9ce8094b9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_18.txt @@ -0,0 +1,26 @@ +tokenizer: PreTrainedTokenizerBase + padding: Union[bool, str, PaddingStrategy] = True + max_length: Optional[int] = None + pad_to_multiple_of: Optional[int] = None + def __call__(self, features): + label_name = "label" if "label" in features[0].keys() else "labels" + labels = [feature.pop(label_name) for feature in features] + batch_size = len(features) + num_choices = len(features[0]["input_ids"]) + flattened_features = [ + [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features + ] + flattened_features = sum(flattened_features, []) + batch = self.tokenizer.pad( + flattened_features, + padding=self.padding, + max_length=self.max_length, + pad_to_multiple_of=self.pad_to_multiple_of, + return_tensors="tf", + ) + batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()} + batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64) + return batch + +Evaluate +Including a metric during training is often helpful for evaluating your model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_19.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb54a12b788c6744523d6aed7df8543dd99480f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_19.txt @@ -0,0 +1 @@ +You can quickly load an evaluation method with the 🤗 Evaluate library. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_20.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d5f147cfbfa8389628e615d4376203a58f3efd5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_20.txt @@ -0,0 +1,14 @@ +For this task, load the accuracy metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +accuracy = evaluate.load("accuracy") + +Then create a function that passes your predictions and labels to [~evaluate.EvaluationModule.compute] to calculate the accuracy: + +import numpy as np +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return accuracy.compute(predictions=predictions, references=labels) + +Your compute_metrics function is ready to go now, and you'll return to it when you setup your training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_21.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f493faa6fd8490ca5e6198c99a80a5e6e236cf1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_21.txt @@ -0,0 +1,3 @@ +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_22.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_22.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_23.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..57147f11012bc30652588ce6304a36dc3d9a185d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_23.txt @@ -0,0 +1,8 @@ +Load BERT with [AutoModelForMultipleChoice]: + +from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer +model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_24.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_24.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_25.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_25.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_26.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c977d29a14cc4699a4b06a477bb52c2490bdc8e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_26.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the accuracy and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_27.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_27.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_28.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_28.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_29.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad39feaa446fe943be36756c93715953073839f0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_29.txt @@ -0,0 +1,28 @@ +training_args = TrainingArguments( + output_dir="my_awesome_swag_model", + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + learning_rate=5e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=3, + weight_decay=0.01, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_swag["train"], + eval_dataset=tokenized_swag["validation"], + tokenizer=tokenizer, + data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer), + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_30.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bec0e1526659a7194c78a75d4131ed2be561c83 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_30.txt @@ -0,0 +1,30 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer +batch_size = 16 +num_train_epochs = 2 +total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs +optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps) + +Then you can load BERT with [TFAutoModelForMultipleChoice]: + +from transformers import TFAutoModelForMultipleChoice +model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased") + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer) +tf_train_set = model.prepare_tf_dataset( + tokenized_swag["train"], + shuffle=True, + batch_size=batch_size, + collate_fn=data_collator, + ) +tf_validation_set = model.prepare_tf_dataset( + tokenized_swag["validation"], + shuffle=False, + batch_size=batch_size, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_31.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c81916ab8b486272429671ce31cf4063f1711dd9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_31.txt @@ -0,0 +1,3 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +model.compile(optimizer=optimizer) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_32.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..778523a4558517d06c3b5f23ba5eeb1897e7b0ef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_32.txt @@ -0,0 +1 @@ +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_33.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69fb6436e3e5a775a94edf0a0ecab9dd3584f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_33.txt @@ -0,0 +1 @@ +Both are done by using Keras callbacks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_34.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ba3cbd8b5ad819ce967d3055a9ef00ea5ee5021 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_34.txt @@ -0,0 +1,18 @@ +Pass your compute_metrics function to [~transformers.KerasMetricCallback]: + +from transformers.keras_callbacks import KerasMetricCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) + +Specify where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="my_awesome_model", + tokenizer=tokenizer, + ) + +Then bundle your callbacks together: + +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_35.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..d015b158b6bdca4e79f697ab63fae874c1898ce4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_35.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_36.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9307630f2c591ca75df8e008cad0a8e87de310c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_36.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for multiple choice, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_37.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_37.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_38.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d3188dbdd411af613b237b89fa01642ed3e8323 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_38.txt @@ -0,0 +1,3 @@ +Come up with some text and two candidate answers: + +prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_39.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a82b104a77f1422815622b396a71d8728bb70fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_39.txt @@ -0,0 +1 @@ +candidate1 = "The law does not apply to croissants and brioche." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_40.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..897a0dc657bd2f118d99285497cc412e7e8a030b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_40.txt @@ -0,0 +1 @@ +candidate2 = "The law applies to baguettes." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_41.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c36a6239b0074b8adb93a5e375a15e6a29c0715 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_41.txt @@ -0,0 +1 @@ +Tokenize each prompt and candidate answer pair and return PyTorch tensors. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_multiple_choice/chunk_42.txt b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..77d9b5b9e454b01c93ecf4a64775d90fd12b1f77 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_multiple_choice/chunk_42.txt @@ -0,0 +1,39 @@ +You should also create some labels: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True) +labels = torch.tensor(0).unsqueeze(0) + +Pass your inputs and labels to the model and return the logits: + +from transformers import AutoModelForMultipleChoice +model = AutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels) +logits = outputs.logits + +Get the class with the highest probability: + +predicted_class = logits.argmax().item() +predicted_class +'0' + +Tokenize each prompt and candidate answer pair and return TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_swag_model") +inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True) + +Pass your inputs to the model and return the logits: + +from transformers import TFAutoModelForMultipleChoice +model = TFAutoModelForMultipleChoice.from_pretrained("my_awesome_swag_model") +inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()} +outputs = model(inputs) +logits = outputs.logits + +Get the class with the highest probability: + +predicted_class = int(tf.math.argmax(logits, axis=-1)[0]) +predicted_class +'0' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_25.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..53fbe5a5d1079132ac24e53e74191adb41d8d281 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_25.txt @@ -0,0 +1,2 @@ 
+These values are crucial +to replicate when doing inference or finetuning a pre-trained image model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_26.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc823830762a99d1ffde9b6aeabef0f34a03641c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_26.txt @@ -0,0 +1 @@ +Instantiate the image processor from the same checkpoint as the model you want to finetune. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_27.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..84022caeec8f4934bb96b858876cb3c096aaf051 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_27.txt @@ -0,0 +1,8 @@ +from transformers import AutoImageProcessor +checkpoint = "facebook/detr-resnet-50" +image_processor = AutoImageProcessor.from_pretrained(checkpoint) + +Before passing the images to the image_processor, apply two preprocessing transformations to the dataset: +- Augmenting images +- Reformatting annotations to meet DETR expectations +First, to make sure the model does not overfit on the training data, you can apply image augmentation with any data augmentation library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_28.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..22a3826329a6d1081a2e1ee31eb3d2d62c855945 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_28.txt @@ -0,0 +1,2 @@ +Here we use Albumentations +This library ensures that transformations affect the image and update the bounding boxes accordingly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_29.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0449e5f7293d2a95d984c3f856d8c1ad715b6f1e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_29.txt @@ -0,0 +1,2 @@ +The 🤗 Datasets library documentation has a detailed guide on how to augment images for object detection, +and it uses the exact same dataset as an example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_30.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0174c5b941823022654b31f76d65ad735956043 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_30.txt @@ -0,0 +1,17 @@ +Apply the same approach here, resize each image to (480, 480), +flip it horizontally, and brighten it: + +import albumentations +import numpy as np +import torch +transform = albumentations.Compose( + [ + albumentations.Resize(480, 480), + albumentations.HorizontalFlip(p=1.0), + albumentations.RandomBrightnessContrast(p=1.0), + ], + bbox_params=albumentations.BboxParams(format="coco", label_fields=["category"]), + ) + +The image_processor expects the annotations to be in the following format: {'image_id': int, 'annotations': List[Dict]}, + where each dictionary is a COCO object annotation. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_31.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ee337bacde4697ff8f3f8ea1cb5be41712670e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_31.txt @@ -0,0 +1,37 @@ +Let's add a function to reformat annotations for a single example: + +def formatted_anns(image_id, category, area, bbox): + annotations = [] + for i in range(0, len(category)): + new_ann = { + "image_id": image_id, + "category_id": category[i], + "isCrowd": 0, + "area": area[i], + "bbox": list(bbox[i]), + } + annotations.append(new_ann) + + return annotations + +Now you can combine the image and annotation transformations to use on a batch of examples: + +transforming a batch +def transform_aug_ann(examples): + image_ids = examples["image_id"] + images, bboxes, area, categories = [], [], [], [] + for image, objects in zip(examples["image"], examples["objects"]): + image = np.array(image.convert("RGB"))[:, :, ::-1] + out = transform(image=image, bboxes=objects["bbox"], category=objects["category"]) + + area.append(objects["area"]) + images.append(out["image"]) + bboxes.append(out["bboxes"]) + categories.append(out["category"]) + targets = [ + {"image_id": id_, "annotations": formatted_anns(id_, cat_, ar_, box_)} + for id_, cat_, ar_, box_ in zip(image_ids, categories, area, bboxes) + ] + return image_processor(images=images, annotations=targets, return_tensors="pt") + +Apply this preprocessing function to the entire dataset using 🤗 Datasets [~datasets.Dataset.with_transform] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_32.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..6da942e2f10ce35c17f403ec87f4f8b4be6ab81f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_32.txt @@ -0,0 +1,2 @@ +This method applies +transformations on the fly when you load an element of the dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_33.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..2264e68fe9a290fe51e1df2bece801d73537fbc2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_33.txt @@ -0,0 +1 @@ +At this point, you can check what an example from the dataset looks like after the transformations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_34.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7d8acd528400483256a4d68774f2fd823ec17c9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_34.txt @@ -0,0 +1,2 @@ +You should see a tensor +with pixel_values, a tensor with pixel_mask, and labels. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_35.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..39ab6b75ad1937588c66a07c15ee3b7385a83a0d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_35.txt @@ -0,0 +1,36 @@ +cppe5["train"] = cppe5["train"].with_transform(transform_aug_ann) +cppe5["train"][15] +{'pixel_values': tensor([[[ 0.9132, 0.9132, 0.9132, , -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, , -1.9809, -1.9809, -1.9809], + [ 0.9132, 0.9132, 0.9132, , -1.9638, -1.9638, -1.9638], + , + [-1.5699, -1.5699, -1.5699, , -1.9980, -1.9980, -1.9980], + [-1.5528, -1.5528, -1.5528, , -1.9980, -1.9809, -1.9809], + [-1.5528, -1.5528, -1.5528, , -1.9980, -1.9809, -1.9809]], + + [[ 1.3081, 1.3081, 1.3081, , -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, , -1.8431, -1.8431, -1.8431], + [ 1.3081, 1.3081, 1.3081, , -1.8256, -1.8256, -1.8256], + , + [-1.3179, -1.3179, -1.3179, , -1.8606, -1.8606, -1.8606], + [-1.3004, -1.3004, -1.3004, , -1.8606, -1.8431, -1.8431], + [-1.3004, -1.3004, -1.3004, , -1.8606, -1.8431, -1.8431]], + + [[ 1.4200, 1.4200, 1.4200, , -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, , -1.6476, -1.6476, -1.6476], + [ 1.4200, 1.4200, 1.4200, , -1.6302, -1.6302, -1.6302], + , + [-1.0201, -1.0201, -1.0201, , -1.5604, -1.5604, -1.5604], + [-1.0027, -1.0027, -1.0027, , -1.5604, -1.5430, -1.5430], + [-1.0027, -1.0027, -1.0027, , -1.5604, -1.5430, -1.5430]]]), + +'pixel_mask': tensor([[1, 1, 1, , 1, 1, 1], + [1, 1, 1, , 1, 1, 1], + [1, 1, 1, , 1, 1, 1], + , + [1, 1, 1, , 1, 1, 1], + [1, 1, 1, , 1, 1, 1], + [1, 1, 1, , 1, 1, 1]]), + 'labels': {'size': tensor([800, 800]), 'image_id': tensor([756]), 'class_labels': tensor([4]), 'boxes': tensor([[0.7340, 0.6986, 0.3414, 0.5944]]), 'area': tensor([519544.4375]), 'iscrowd': tensor([0]), 'orig_size': tensor([480, 480])}} + +You have successfully augmented the individual images and prepared their annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_36.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..173431fee450c939ebad2e4c96cb7e5c9ee976dc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_36.txt @@ -0,0 +1,2 @@ +However, preprocessing isn't +complete yet. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_37.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0055a83321c3bc4a2ab8c98fa57aec0eef94eff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_37.txt @@ -0,0 +1 @@ +In the final step, create a custom collate_fn to batch images together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_38.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..b904f5364201f5ee702c76ed788b431adc642e6f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_38.txt @@ -0,0 +1,2 @@ +Pad images (which are now pixel_values) to the largest image in a batch, and create a corresponding pixel_mask +to indicate which pixels are real (1) and which are padding (0). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_39.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e88016843f8914db02035490b6cee3eade14c3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_39.txt @@ -0,0 +1,12 @@ +def collate_fn(batch): + pixel_values = [item["pixel_values"] for item in batch] + encoding = image_processor.pad(pixel_values, return_tensors="pt") + labels = [item["labels"] for item in batch] + batch = {} + batch["pixel_values"] = encoding["pixel_values"] + batch["pixel_mask"] = encoding["pixel_mask"] + batch["labels"] = labels + return batch + +Training the DETR model +You have done most of the heavy lifting in the previous sections, so now you are ready to train your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_40.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e2dcf8ba0e95f1e0b0af6e14d23b9b52d8d644 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_40.txt @@ -0,0 +1 @@ +The images in this dataset are still quite large, even after resizing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_41.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1f68a9451067ff9c1b6432db7e7fdc72c58616d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_41.txt @@ -0,0 +1,2 @@ +This means that finetuning this model will +require at least one GPU. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_42.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc1c4bb3d387c127db03d7f17f9e8ebf1491fd5e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_42.txt @@ -0,0 +1,2 @@ +Training involves the following steps: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_43.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d615f28c6980dccbe51b8f33b93e25e8f02ca9f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_43.txt @@ -0,0 +1 @@ +Load the model with [AutoModelForObjectDetection] using the same checkpoint as in the preprocessing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_44.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_44.txt @@ -0,0 +1 @@ +2. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_45.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e33f9173799a9c64b3a0fa07d75bc3fcded6822 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_45.txt @@ -0,0 +1 @@ +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_46.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_46.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_47.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..0964a89d78fd804b744f2e11743909639f3377fa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_47.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, image processor, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_48.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_48.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_49.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_49.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_50.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..05201a6842074d836b910e7bb95b2b11be08f185 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_50.txt @@ -0,0 +1,2 @@ +When loading the model from the same checkpoint that you used for the preprocessing, remember to pass the label2id +and id2label maps that you created earlier from the dataset's metadata. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_51.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..67b6a7e7747335332b2cda7876f7fdc0a3bffb06 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_51.txt @@ -0,0 +1 @@ +Additionally, we specify ignore_mismatched_sizes=True to replace the existing classification head with a new one. 
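The label2id and id2label maps referenced above were built in an earlier part of this guide that is not included in this excerpt. As a minimal sketch, assuming cppe5 is the loaded CPPE-5 dataset and the class names live in its "objects" feature, they can be reconstructed like this:

# hypothetical reconstruction of the label maps from the dataset features
categories = cppe5["train"].features["objects"].feature["category"].names
id2label = {index: name for index, name in enumerate(categories)}
label2id = {name: index for index, name in id2label.items()}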
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_52.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..fde26135e7fc9026e1a1b4e436f66e4193b850e9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_52.txt @@ -0,0 +1,9 @@ +from transformers import AutoModelForObjectDetection +model = AutoModelForObjectDetection.from_pretrained( + checkpoint, + id2label=id2label, + label2id=label2id, + ignore_mismatched_sizes=True, + ) + +In the [TrainingArguments] use output_dir to specify where to save your model, then configure hyperparameters as you see fit. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_53.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b7c547c0283539544ec2453f076bb48b22a190b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_53.txt @@ -0,0 +1 @@ +It is important you do not remove unused columns because this will drop the image column. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_54.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..18bb737b259476c0fa589731b3e817463473e7da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_54.txt @@ -0,0 +1,2 @@ +Without the image column, you +can't create pixel_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_55.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f92d38d322cf57ec177af7a8200237574ec0a74a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_55.txt @@ -0,0 +1 @@ +For this reason, set remove_unused_columns to False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_56.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..b656549947eca23364c1153690da1449debfe985 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_56.txt @@ -0,0 +1,2 @@ +If you wish to share your model by pushing to the Hub, set push_to_hub to True (you must be signed in to Hugging +Face to upload your model). 
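One common way to sign in from a notebook is with the huggingface_hub helper shown below; this is a minimal sketch and assumes you have a Hugging Face access token with write permissions.

from huggingface_hub import notebook_login

notebook_login()  # paste a token with write access when prompted

From a terminal, running huggingface-cli login achieves the same thing.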
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_57.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..38bdc29e8d55a2c882435e0317122dbfa85d2959 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_57.txt @@ -0,0 +1,29 @@ +from transformers import TrainingArguments +training_args = TrainingArguments( + output_dir="detr-resnet-50_finetuned_cppe5", + per_device_train_batch_size=8, + num_train_epochs=10, + fp16=True, + save_steps=200, + logging_steps=50, + learning_rate=1e-5, + weight_decay=1e-4, + save_total_limit=2, + remove_unused_columns=False, + push_to_hub=True, + ) + +Finally, bring everything together, and call [~transformers.Trainer.train]: + +from transformers import Trainer +trainer = Trainer( + model=model, + args=training_args, + data_collator=collate_fn, + train_dataset=cppe5["train"], + tokenizer=image_processor, + ) +trainer.train() + +If you have set push_to_hub to True in the training_args, the training checkpoints are pushed to the +Hugging Face Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_58.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1d0a5e081d84c6be47c3e5b584b752b6b6553fa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_58.txt @@ -0,0 +1 @@ +Upon training completion, push the final model to the Hub as well by calling the [~transformers.Trainer.push_to_hub] method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_59.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c2752c8ca23a631ecf69835343a053e41381c26 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_59.txt @@ -0,0 +1,4 @@ +trainer.push_to_hub() + +Evaluate +Object detection models are commonly evaluated with a set of COCO-style metrics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_60.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2665c0da534ee054f09375e55a242431a1f70ea --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_60.txt @@ -0,0 +1,2 @@ +You can use one of the existing metrics implementations, but here you'll use the one from torchvision to evaluate the final +model that you pushed to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_61.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..aaf7ad36a612d76eaa654c0bf24109beb31e3c6f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_61.txt @@ -0,0 +1 @@ +To use the torchvision evaluator, you'll need to prepare a ground truth COCO dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_62.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..28deef3869787296981d41963c89b9e1fff982f8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_62.txt @@ -0,0 +1,2 @@ +The API to build a COCO dataset +requires the data to be stored in a certain format, so you'll need to save images and annotations to disk first. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_63.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..dddd4040736473596782e27e139135cc251e7b43 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_63.txt @@ -0,0 +1,2 @@ +Just like +when you prepared your data for training, the annotations from the cppe5["test"] need to be formatted. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_64.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd7fbba43c026c1853c0c5ca42123928d6ce810d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_64.txt @@ -0,0 +1,2 @@ +However, images +should stay as they are. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_65.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a5c51db5ddd329c9379c5509b4b9e25be2a13af --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_65.txt @@ -0,0 +1 @@ +The evaluation step requires a bit of work, but it can be split in three major steps. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_66.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..84fde31a848fbda074caf2cfffd6ef280d9852f8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_66.txt @@ -0,0 +1 @@ +First, prepare the cppe5["test"] set: format the annotations and save the data to disk. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_67.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..51f687b575b30914e8ad8a03352df110e133282c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_67.txt @@ -0,0 +1,49 @@ +import json +import os +# format annotations the same as for training, no need for data augmentation +def val_formatted_anns(image_id, objects): + annotations = [] + for i in range(0, len(objects["id"])): + new_ann = { + "id": objects["id"][i], + "category_id": objects["category"][i], + "iscrowd": 0, + "image_id": image_id, + "area": objects["area"][i], + "bbox": objects["bbox"][i], + } + annotations.append(new_ann) + + return annotations + +# Save images and annotations into the files torchvision.datasets.CocoDetection expects +def save_cppe5_annotation_file_images(cppe5): + output_json = {} + path_output_cppe5 = f"{os.getcwd()}/cppe5/" + + if not os.path.exists(path_output_cppe5): + os.makedirs(path_output_cppe5) + path_anno = os.path.join(path_output_cppe5, "cppe5_ann.json") + categories_json = [{"supercategory": "none", "id": id, "name": id2label[id]} for id in id2label] + output_json["images"] = [] + output_json["annotations"] = [] + for example in cppe5: + ann = val_formatted_anns(example["image_id"], example["objects"]) + output_json["images"].append( + { + "id": example["image_id"], + "width": example["image"].width, + "height": example["image"].height, + "file_name": f"{example['image_id']}.png", + } + ) + output_json["annotations"].extend(ann) + output_json["categories"] = categories_json + with open(path_anno, "w") as file: + json.dump(output_json, file, ensure_ascii=False, indent=4) + for im, img_id in zip(cppe5["image"], cppe5["image_id"]): + path_img = os.path.join(path_output_cppe5, f"{img_id}.png") + im.save(path_img) + return path_output_cppe5, path_anno + +Next, prepare an instance of a CocoDetection class that can be used with cocoevaluator.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_68.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..7801a12dbdc1b459fd40366ab7c1a7ff1968fe70 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_68.txt @@ -0,0 +1,23 @@ +import torchvision +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, image_processor, ann_file): + super().__init__(img_folder, ann_file) + self.image_processor = image_processor + + def __getitem__(self, idx): + # read in PIL image and target in COCO format + img, target = super(CocoDetection, self).__getitem__(idx) + # preprocess image and target: converting target to DETR format, + # resizing + normalization of both image and target + image_id = self.ids[idx] + target = {"image_id": image_id, "annotations": target} + encoding = self.image_processor(images=img, annotations=target, return_tensors="pt") + pixel_values = encoding["pixel_values"].squeeze() # remove batch dimension + target = encoding["labels"][0] # remove batch dimension + return {"pixel_values": pixel_values, "labels": target} + +im_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +path_output_cppe5, path_anno = save_cppe5_annotation_file_images(cppe5["test"]) +test_ds_coco_format = CocoDetection(path_output_cppe5, im_processor, path_anno) + +Finally, load the metrics and run the evaluation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_69.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..468931fd1a34a76f657c8c9312ea5caea88fd9e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_69.txt @@ -0,0 +1,26 @@ +import evaluate +from tqdm import tqdm +model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +module = evaluate.load("ybelkada/cocoevaluate", coco=test_ds_coco_format.coco) +val_dataloader = torch.utils.data.DataLoader( + test_ds_coco_format, batch_size=8, shuffle=False, num_workers=4, collate_fn=collate_fn + ) +with torch.no_grad(): + for idx, batch in enumerate(tqdm(val_dataloader)): + pixel_values = batch["pixel_values"] + pixel_mask = batch["pixel_mask"] + + labels = [ + {k: v for k, v in t.items()} for t in batch["labels"] + ] # these are in DETR format, resized + normalized + # forward pass + outputs = model(pixel_values=pixel_values, pixel_mask=pixel_mask) + orig_target_sizes = torch.stack([target["orig_size"] for target in labels], dim=0) + results = im_processor.post_process(outputs, orig_target_sizes) # convert outputs of model to Pascal VOC format (xmin, ymin, xmax, ymax) + module.add(prediction=results, reference=labels) + del batch + +results = module.compute() +print(results) +Accumulating evaluation results +DONE (t=0.08s).
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_70.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..6916b33cc67bdb75a7b4d93c2f29c6911a22b65f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_70.txt @@ -0,0 +1,15 @@ +IoU metric: bbox + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.352 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.681 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.292 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.168 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.208 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.429 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.274 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.484 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.501 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.191 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.323 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.590 +`` +These results can be further improved by adjusting the hyperparameters in [~transformers.TrainingArguments`]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_71.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..4435b1b9366126ac25582fcf251067c80afb84bf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_71.txt @@ -0,0 +1 @@ +Give it a go! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_72.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..40f7063ec6f60052b289eb1eaef3789df4a6b1e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_72.txt @@ -0,0 +1,2 @@ +Inference +Now that you have finetuned a DETR model, evaluated it, and uploaded it to the Hugging Face Hub, you can use it for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_73.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a44c74dfaf36716f011b13791e886a26aca01e3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_73.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [Pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_object_detection/chunk_74.txt b/chunked/content_aware_chunking/tasks_object_detection/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4072324b502e9976dd2f75e09bed6e2ad6bb9ef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_object_detection/chunk_74.txt @@ -0,0 +1,37 @@ +Instantiate a pipeline +for object detection with your model, and pass an image to it: + +from transformers import pipeline +import requests +url = "https://i.imgur.com/2lnWoly.jpg" +image = Image.open(requests.get(url, stream=True).raw) +obj_detector = pipeline("object-detection", model="devonho/detr-resnet-50_finetuned_cppe5") +obj_detector(image) + +You can also manually replicate the results of the pipeline if you'd like: + +image_processor = AutoImageProcessor.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +model = AutoModelForObjectDetection.from_pretrained("devonho/detr-resnet-50_finetuned_cppe5") +with torch.no_grad(): + inputs = image_processor(images=image, return_tensors="pt") + outputs = model(**inputs) + target_sizes = torch.tensor([image.size[::-1]]) + results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[0] +for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + box = [round(i, 2) for i in box.tolist()] + print( + f"Detected {model.config.id2label[label.item()]} with confidence " + f"{round(score.item(), 3)} at location {box}" + ) +Detected Coverall with confidence 0.566 at location [1215.32, 147.38, 4401.81, 3227.08] +Detected Mask with confidence 0.584 at location [2449.06, 823.19, 3256.43, 1413.9] + +Let's plot the result: + +draw = ImageDraw.Draw(image) +for score, label, box in zip(results["scores"], results["labels"], results["boxes"]): + box = [round(i, 2) for i in box.tolist()] + x, y, x2, y2 = tuple(box) + draw.rectangle((x, y, x2, y2), outline="red", width=1) + draw.text((x, y), model.config.id2label[label.item()], fill="white") +image \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_100.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7ee78c05076ea4fcbce14e3d6252df9fadc9e1b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_100.txt @@ -0,0 +1,2 @@ +In this case, you can +try the technique called few-shot prompting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_101.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..939a4f4aa2d3195d1098d82b56791e885da024f3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_101.txt @@ -0,0 +1 @@ +In few-shot prompting, we provide examples in the prompt giving the model more context to improve the performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_102.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa1751f9c551702edbe2da79bfa22404f3ee94c8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_102.txt @@ -0,0 +1 @@ +The examples condition the model to generate the output following the patterns in the examples. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_103.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..962817afd382350f225fbd254cb6e4be78e7cfc6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_103.txt @@ -0,0 +1,5 @@ +Here's an example: +thon + +torch.manual_seed(0) # doctest: +IGNORE_RESULT +prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_104.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fb6fef511ebd38d5a09431592e6565e30fde8fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_104.txt @@ -0,0 +1,2 @@ +Date: 04/12/1961 + Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_105.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..652294c7509df568f6b0f7c4f6b6750a67970e90 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_105.txt @@ -0,0 +1,10 @@ +Date:""" +sequences = pipe( + prompt, + max_new_tokens=8, + do_sample=True, + top_k=10, + ) +for seq in sequences: + print(f"Result: {seq['generated_text']}") +Result: Text: The first human went into space and orbited the Earth on April 12, 1961. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_106.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..8af10004d09b912da3b02798414ed4941a3e9f4b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_106.txt @@ -0,0 +1,2 @@ +Date: 04/12/1961 +Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_107.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9cb998dcdf07527c0568430268b02a7280521fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_107.txt @@ -0,0 +1,4 @@ +Date: 09/28/1960 + +In the above code snippet we used a single example to demonstrate the desired output to the model, so this can be called a +"one-shot" prompting. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_108.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..44df16e909bc0cdfb8c0e497751fac72ac851096 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_108.txt @@ -0,0 +1 @@ +However, depending on the task complexity you may need to use more than one example. 
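For illustration, here is a hedged sketch of a few-shot variant of the same date-extraction prompt with two demonstrations instead of one. It assumes the same pipe pipeline object and torch import used earlier in this guide; the extra Berlin Wall demonstration is added purely for illustration.

torch.manual_seed(0)
# two demonstrations followed by the query text whose date we want extracted
prompt = """Text: The first human went into space and orbited the Earth on April 12, 1961.
Date: 04/12/1961
Text: The Berlin Wall fell on November 9, 1989.
Date: 11/09/1989
Text: The first-ever televised presidential debate in the United States took place on September 28, 1960, between presidential candidates John F. Kennedy and Richard Nixon.
Date:"""
sequences = pipe(
    prompt,
    max_new_tokens=8,
    do_sample=True,
    top_k=10,
    return_full_text=False,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")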
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_109.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..123776a596e3d089670ce58f77618fd334c3ad2b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_109.txt @@ -0,0 +1,3 @@ +Limitations of the few-shot prompting technique: +- While LLMs can pick up on the patterns in the examples, this technique doesn't work well on complex reasoning tasks +- Few-shot prompting requires creating lengthy prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_110.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f9b3ee9db7eaf0d2d9716de274da1f8d4364bb4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_110.txt @@ -0,0 +1 @@ +Prompts with a large number of tokens can increase computation and latency. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_111.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..6040357aee7ec25f6dabc184a3910eebe10d5dba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_111.txt @@ -0,0 +1 @@ +There's also a limit to the length of the prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_112.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dd1332fe3fe1e784f25ed97e2b590e849638e03 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_112.txt @@ -0,0 +1 @@ +- Sometimes when given a number of examples, models can learn patterns that you didn't intend them to learn, e.g. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_113.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..9380ff9fb9fe9db396c91fda5418109e59c65bd1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_113.txt @@ -0,0 +1 @@ +that the third movie review is always negative. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_114.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..01f1e83f1d34d36a00ce666e7a3fcaf9acad67c2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_114.txt @@ -0,0 +1,3 @@ +Chain-of-thought +Chain-of-thought (CoT) prompting is a technique that nudges a model to produce intermediate reasoning steps, thus improving +the results on complex reasoning tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_115.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d886ae2d0d28e0560e95138c9bab60bb6567637 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_115.txt @@ -0,0 +1,2 @@ +There are two ways of steering a model to produce the reasoning steps: +- few-shot prompting by illustrating examples with detailed answers to questions, showing the model how to work through a problem.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_116.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aa0fd5389e1fe4ea892b65f8e306be6878efea6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_116.txt @@ -0,0 +1 @@ +- by instructing the model to reason by adding phrases like "Let's think step by step" or "Take a deep breath and work through the problem step by step." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_117.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..9de69ba91cd70f027167ca15649c38d4a1e78586 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_117.txt @@ -0,0 +1,6 @@ +If we apply the CoT technique to the muffins example from the reasoning section and use a larger model, +such as (tiiuae/falcon-180B-chat) which you can play with in the HuggingChat, +we'll get a significant improvement on the reasoning result: +text +Let's go through this step-by-step: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_118.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b42961d7a699fa67be944a1ac872724bcb8f84d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_118.txt @@ -0,0 +1 @@ +You start with 15 muffins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_119.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_119.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_120.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ddecf0880df632228c34862358c95dea0001b99 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_120.txt @@ -0,0 +1 @@ +You eat 2 muffins, leaving you with 13 muffins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_121.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_121.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_122.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ded4a6fd2ab2c4d2becd97c3bb2407d3b715033 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_122.txt @@ -0,0 +1 @@ +You give 5 muffins to your neighbor, leaving you with 8 muffins. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_123.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_123.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_124.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..6887c94a767fef6d46f99ca0a0f645cc6a7d23b0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_124.txt @@ -0,0 +1 @@ +Your partner buys 6 more muffins, bringing the total number of muffins to 14. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_125.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_125.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_126.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..7995e1c0078ee9345bef387f9beaa7852acee5f5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_126.txt @@ -0,0 +1 @@ +Your partner eats 2 muffins, leaving you with 12 muffins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_127.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..294a8ef65759218bdd717340bd59f5f5e40ff2a5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_127.txt @@ -0,0 +1 @@ +Therefore, you now have 12 muffins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_128.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..86519e3cb0e57192b1d76a0bf71a02628fa4e84f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_128.txt @@ -0,0 +1,3 @@ +Prompting vs fine-tuning +You can achieve great results by optimizing your prompts, however, you may still ponder whether fine-tuning a model +would work better for your case. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_129.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..83a00ea552ad2bcd604a9a4757d3868451712fa9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_129.txt @@ -0,0 +1,3 @@ +Here are some scenarios when fine-tuning a smaller model may be a preferred option: + +Your domain is wildly different from what LLMs were pre-trained on and extensive prompt optimization did not yield sufficient results. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_130.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e7c7f6967ab9eacba62f50a94e7df74ffb8447f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_130.txt @@ -0,0 +1 @@ +You need your model to work well in a low-resource language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_131.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac0f9a2555434e62a74e3a555ca5893b7b7d00e5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_131.txt @@ -0,0 +1 @@ +You need the model to be trained on sensitive data that is under strict regulations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_132.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..097ff668b92994054135a293165787f5bb60a048 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_132.txt @@ -0,0 +1 @@ +You have to use a small model due to cost, privacy, infrastructure or other limitations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_133.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e89a818901c322a4784c4bac3b6dbbd2c86eac2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_133.txt @@ -0,0 +1,2 @@ +In all of the above examples, you will need to make sure that you either already have or can easily obtain a large enough +domain-specific dataset at a reasonable cost to fine-tune a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_134.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..b06c5431a5dd8f6359a9e929389b9f471a9116c3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_134.txt @@ -0,0 +1,2 @@ +You will also need to have enough time and resources +to fine-tune a model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_135.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..2971ae059e6caebca788cea31ec33e44f6fbc994 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_135.txt @@ -0,0 +1 @@ +If the above examples are not the case for you, optimizing prompts can prove to be more beneficial. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_39.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..e94b017797132250ca36817bc16ed8159a07353b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_39.txt @@ -0,0 +1 @@ +Text: This movie is definitely one of my favorite movies of its kind. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_40.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..525ee71ce0218a42f926cdb19f1516c9e26c6c6e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_40.txt @@ -0,0 +1 @@ +The interaction between respectable and morally strong characters is an ode to chivalry and the honor code amongst thieves and policemen. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_41.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..8416793c3a730b2a9f7aaad6ba0a4d0194b30b1e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_41.txt @@ -0,0 +1,4 @@ +Sentiment: +Positive + +As a result, the output contains a classification label from the list we have provided in the instructions, and it is a correct one! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_42.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7a52f825c9f268befba26f6ab45acede9e397dc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_42.txt @@ -0,0 +1 @@ +You may notice that in addition to the prompt, we pass a max_new_tokens parameter. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_43.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d2bafcae89413fcab887e53b08555808ed0e091 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_43.txt @@ -0,0 +1,3 @@ +It controls the number of tokens the +model shall generate, and it is one of the many text generation parameters that you can learn about +in Text generation strategies guide. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_44.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..872b9b5c3f5c1efef382fdc62f37497911204724 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_44.txt @@ -0,0 +1,2 @@ +Named Entity Recognition +Named Entity Recognition (NER) is a task of finding named entities in a piece of text, such as a person, location, or organization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_45.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e6656853b2e8d23dac52e9da2d9fac2050a2520 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_45.txt @@ -0,0 +1 @@ +Let's modify the instructions in the prompt to make the LLM perform this task. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_46.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d86349df323669765c2570613272f724115de52 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_46.txt @@ -0,0 +1,6 @@ +Here, let's also set return_full_text = False +so that output doesn't contain the prompt: +thon + +torch.manual_seed(1) # doctest: +IGNORE_RESULT +prompt = """Return a list of named entities in the text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_47.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..76f727cbe89dd2106c8056ba608216aae3ef6711 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_47.txt @@ -0,0 +1 @@ +Text: The Golden State Warriors are an American professional basketball team based in San Francisco. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_48.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..99b86caf1140e992945bb61ee1f35ff5486afc74 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_48.txt @@ -0,0 +1,13 @@ +Named entities: + """ +sequences = pipe( + prompt, + max_new_tokens=15, + return_full_text = False, + ) +for seq in sequences: + print(f"{seq['generated_text']}") +- Golden State Warriors +- San Francisco + +As you can see, the model correctly identified two named entities from the given text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_49.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a23ce73878f94dea25d60cf9de7b672933136ded --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_49.txt @@ -0,0 +1,2 @@ +Translation +Another task LLMs can perform is translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_50.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..208aeecfcab0e750deee41f688df95e20cfbcf25 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_50.txt @@ -0,0 +1,2 @@ +You can choose to use encoder-decoder models for this task, however, here, +for the simplicity of the examples, we'll keep using Falcon-7b-instruct, which does a decent job. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_51.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7d55e3ded131b263e92378bf62eccffd4628c1a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_51.txt @@ -0,0 +1,6 @@ +Once again, here's how +you can write a basic prompt to instruct a model to translate a piece of text from English to Italian: +thon + +torch.manual_seed(2) # doctest: +IGNORE_RESULT +prompt = """Translate the English text to Italian. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_52.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc87e36114111cf6b7be146cde34d338c8876ebf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_52.txt @@ -0,0 +1 @@ +Text: Sometimes, I've believed as many as six impossible things before breakfast. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_53.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e299a1eedc7c06274b964e3b7af228c60f3e270 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_53.txt @@ -0,0 +1,12 @@ +Translation: + """ +sequences = pipe( + prompt, + max_new_tokens=20, + do_sample=True, + top_k=10, + return_full_text = False, + ) +for seq in sequences: + print(f"{seq['generated_text']}") +A volte, ho creduto a sei impossibili cose prima di colazione. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_54.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b11d6659a414eb02e182b4f35ecacfc66752fd9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_54.txt @@ -0,0 +1 @@ +Here we've added a do_sample=True and top_k=10 to allow the model to be a bit more flexible when generating output. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_55.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..eec726165b2d9a4de9ad403552a909ed8f4cba42 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_55.txt @@ -0,0 +1,3 @@ +Text summarization +Similar to the translation, text summarization is another generative task where the output heavily relies on the input, +and encoder-decoder models can be a better choice. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_56.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..0adb48e72f2939e934a897c006e4de034a21285e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_56.txt @@ -0,0 +1 @@ +However, decoder-style models can be used for this task as well. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_57.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..82384aab23e1ba410cf1f27abff8f92a7ba184c4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_57.txt @@ -0,0 +1 @@ +Previously, we have placed the instructions at the very beginning of the prompt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_58.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6b02b19b14645e5e4fd38258b435acfd8585002 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_58.txt @@ -0,0 +1,2 @@ +However, the very end of the prompt can +also be a suitable location for instructions. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_59.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e70d59ab17c75dfaaee78646e5de07351aee0c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_59.txt @@ -0,0 +1 @@ +Typically, it's better to place the instruction on one of the extreme ends. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_60.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bcd470858ead05337f04b9f98d9c152e610ec0e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_60.txt @@ -0,0 +1,4 @@ +thon + +torch.manual_seed(3) # doctest: +IGNORE_RESULT +prompt = """Permaculture is a design process mimicking the diversity, functionality and resilience of natural ecosystems. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_61.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..997719b4a2543da04cc5f5a93b1ae84398b0b9e5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_61.txt @@ -0,0 +1 @@ +The principles and practices are drawn from traditional ecological knowledge of indigenous cultures combined with modern scientific understanding and technological innovations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_62.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7539de6e4c5789ff4c9729c9a6c5d24e2c33656 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_62.txt @@ -0,0 +1 @@ +Permaculture design provides a framework helping individuals and communities develop innovative, creative and effective strategies for meeting basic needs while preparing for and mitigating the projected impacts of climate change. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_63.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..985ec27c5aba778453674225eaf7debccf7b77ac --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_63.txt @@ -0,0 +1 @@ +Write a summary of the above text. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_64.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..458c621b5e85461dfbbedce3332ebb0be1164c5a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_64.txt @@ -0,0 +1,12 @@ +Summary: + """ +sequences = pipe( + prompt, + max_new_tokens=30, + do_sample=True, + top_k=10, + return_full_text = False, + ) +for seq in sequences: + print(f"{seq['generated_text']}") +Permaculture is an ecological design mimicking natural ecosystems to meet basic needs and prepare for climate change. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_65.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..16186e20034575ee8433485b53646cde3af2e7d0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_65.txt @@ -0,0 +1 @@ +It is based on traditional knowledge and scientific understanding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_66.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e564c7a89f039ea2c3a5e1469cc56617c07a218 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_66.txt @@ -0,0 +1,7 @@ +Question answering +For question answering task we can structure the prompt into the following logical components: instructions, context, question, and +the leading word or phrase ("Answer:") to nudge the model to start generating the answer: +thon + +torch.manual_seed(4) # doctest: +IGNORE_RESULT +prompt = """Answer the question using the context below. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_67.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..e78a81a722bff085d7d09bb5b5d91fbba20ad1f0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_67.txt @@ -0,0 +1 @@ +Context: Gazpacho is a cold soup and drink made of raw, blended vegetables. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_68.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff3a774c41c7130bef9becec7a71f499674df235 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_68.txt @@ -0,0 +1 @@ +Most gazpacho includes stale bread, tomato, cucumbers, onion, bell peppers, garlic, olive oil, wine vinegar, water, and salt. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_69.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..52f0a5737faccacd2cf8cdb1a864232e6bc1de5f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_69.txt @@ -0,0 +1 @@ +Northern recipes often include cumin and/or pimentón (smoked sweet paprika). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_70.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f781990bca06c91d6e2a0b1ff1d7075bb9e9a67 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_70.txt @@ -0,0 +1 @@ +Traditionally, gazpacho was made by pounding the vegetables in a mortar with a pestle; this more laborious method is still sometimes used as it helps keep the gazpacho cool and avoids the foam and silky consistency of smoothie versions made in blenders or food processors. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_71.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3abd20f2e22206779303e362b582f07c6e16239 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_71.txt @@ -0,0 +1 @@ +Question: What modern tool is used to make gazpacho? \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_72.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..d893c21c9c407a7dacf096ed5924896917f834ee --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_72.txt @@ -0,0 +1,16 @@ +Answer: + """ +sequences = pipe( + prompt, + max_new_tokens=10, + do_sample=True, + top_k=10, + return_full_text = False, + ) +for seq in sequences: + print(f"Result: {seq['generated_text']}") +Result: Modern tools are used, such as immersion blenders + +Reasoning +Reasoning is one of the most difficult tasks for LLMs, and achieving good results often requires applying advanced prompting techniques, like +Chain-of-thought. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_73.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..da8a426ec442c05dc66221553d82d4ccb2a368f3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_73.txt @@ -0,0 +1,5 @@ +Let's see if we can make a model reason about a simple arithmetic task with a basic prompt: +thon + +torch.manual_seed(5) # doctest: +IGNORE_RESULT +prompt = """There are 5 groups of students in the class. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_74.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a151eb9b72208ea4f3431c4f59e7c81b0c53a8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_74.txt @@ -0,0 +1 @@ +Each group has 4 students. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_75.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..5baaaaaadd8f878c225ce091728d4405c30c0029 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_75.txt @@ -0,0 +1 @@ +How many students are there in the class?""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_76.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..c725c508e355d5cd02902f8a5d4ab98f63d16dcc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_76.txt @@ -0,0 +1,11 @@ +sequences = pipe( + prompt, + max_new_tokens=30, + do_sample=True, + top_k=10, + return_full_text = False, + ) +for seq in sequences: + print(f"Result: {seq['generated_text']}") +Result: +There are a total of 5 groups, so there are 5 x 4=20 students in the class. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_77.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..249085ff31bd9469f7f6640f42bdf3e455516e5f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_77.txt @@ -0,0 +1 @@ +Correct! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_78.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1856be72f359800fcc5c5d9870ae5046cc3f60d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_78.txt @@ -0,0 +1,5 @@ +Let's increase the complexity a little and see if we can still get away with a basic prompt: +thon + +torch.manual_seed(6) # doctest: +IGNORE_RESULT +prompt = """I baked 15 muffins. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_79.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..d91d429570bd3ad089ec9b167e1a26b33a10b470 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_79.txt @@ -0,0 +1 @@ +I ate 2 muffins and gave 5 muffins to a neighbor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_80.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..70b8a62a78d67569b1aa9c9981afb71f6593d23c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_80.txt @@ -0,0 +1 @@ +My partner then bought 6 more muffins and ate 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_81.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b12d860cf5228ccf3a8ba6fc60fbf698a93f2b6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_81.txt @@ -0,0 +1 @@ +How many muffins do we now have?""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_82.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..40ad2aa0e935916c6f163df008ef2086e7ed1ef3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_82.txt @@ -0,0 +1,13 @@ +sequences = pipe( + prompt, + max_new_tokens=10, + do_sample=True, + top_k=10, + return_full_text = False, + ) +for seq in sequences: + print(f"Result: {seq['generated_text']}") +Result: +The total number of muffins now is 21 + +This is a wrong answer, it should be 12. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_83.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f4765732c54b98b0cd7b294dbd35e4d6e721781 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_83.txt @@ -0,0 +1,2 @@ +In this case, this can be due to the prompt being too basic, or due to the choice +of model, after all we've picked the smallest version of Falcon. 
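One way to improve on the wrong answer above is to show the model a worked example before asking the new question. The following is a minimal, illustrative sketch of a few-shot, chain-of-thought style prompt for the same muffin problem; the exemplar wording is our own assumption, and pipe refers to the text-generation pipeline created earlier in this guide:

# Illustrative sketch only: one solved problem with explicit intermediate steps,
# followed by the question we actually want answered.
cot_prompt = """Q: There are 5 groups of students in the class. Each group has 4 students. How many students are there in the class?
A: There are 5 groups with 4 students each, so there are 5 * 4 = 20 students. The answer is 20.

Q: I baked 15 muffins. I ate 2 muffins and gave 5 muffins to a neighbor. My partner then bought 6 more muffins and ate 2. How many muffins do we now have?
A:"""
sequences = pipe(
    cot_prompt,
    max_new_tokens=60,
    do_sample=True,
    top_k=10,
    return_full_text=False,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
# With a worked example in the prompt, the model is more likely to spell out the steps
# (15 - 2 - 5 + 6 - 2 = 12), although sampling means a correct answer is still not guaranteed.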
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_84.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..70d798dcb792806ca56171b4db089ee0af08a105 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_84.txt @@ -0,0 +1,2 @@ +Reasoning is difficult for models of all sizes, but larger +models are likely to perform better. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_85.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..596c631b52ea884f666ecdb329968ee2011ef5ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_85.txt @@ -0,0 +1,4 @@ +Best practices of LLM prompting +In this section of the guide we have compiled a list of best practices that tend to improve the prompt results: + +When choosing the model to work with, the latest and most capable models are likely to perform better. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_86.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..734aae87730914174134c7e59ff6a4944a500e36 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_86.txt @@ -0,0 +1 @@ +Start with a simple and short prompt, and iterate from there. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_87.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..b46dafa192d9a5b76edbeda382d40ce5ebde3ca8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_87.txt @@ -0,0 +1 @@ +Put the instructions at the beginning of the prompt, or at the very end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_88.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..2eb92d1254de0ea3617ccad066370f729bd2ae38 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_88.txt @@ -0,0 +1 @@ +When working with large context, models apply various optimizations to prevent Attention complexity from scaling quadratically. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_89.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..c412aeda18223a61cf8525200e555fe7986b7a77 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_89.txt @@ -0,0 +1 @@ +This may make a model more attentive to the beginning or end of a prompt than the middle. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_90.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2a7cb9f9e1205883ceab085cf2bfe02040307ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_90.txt @@ -0,0 +1 @@ +Clearly separate instructions from the text they apply to - more on this in the next section. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_91.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..c02f92e9de31168e1a9140b54925a592366d3579 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_91.txt @@ -0,0 +1 @@ +Be specific and descriptive about the task and the desired outcome - its format, length, style, language, etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_92.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..5861edcac49d34da17176274891de6d329ad70c2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_92.txt @@ -0,0 +1 @@ +Avoid ambiguous descriptions and instructions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_93.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..37f3844700995d0192166913f12e6aaf29d76da7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_93.txt @@ -0,0 +1 @@ +Favor instructions that say "what to do" instead of those that say "what not to do". \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_94.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..49924a025dffa79df919626417ebf16ef6ac10da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_94.txt @@ -0,0 +1 @@ +"Lead" the output in the right direction by writing the first word (or even begin the first sentence for the model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_95.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cc3c4a1042bef6ca17450e358c7a116808b8405 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_95.txt @@ -0,0 +1,2 @@ +Use advanced techniques like Few-shot prompting and Chain-of-thought +Test your prompts with different models to assess their robustness. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_96.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d1edaa5524e9870f512059aa3b3019b1e9b5739 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_96.txt @@ -0,0 +1 @@ +Version and track the performance of your prompts. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_97.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..d497d61498d32522f2fbbd95eb7292808b9933d8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_97.txt @@ -0,0 +1,4 @@ +Advanced prompting techniques +Few-shot prompting +The basic prompts in the sections above are the examples of "zero-shot" prompts, meaning, the model has been given +instructions and context, but no examples with solutions. 
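To make the contrast with "zero-shot" concrete before the discussion continues, here is a minimal, illustrative sketch of a few-shot prompt: a couple of solved examples are prepended so the model can pick up both the task and the expected output format. The texts and labels are placeholders we made up, and pipe is the text-generation pipeline created earlier in this guide:

few_shot_prompt = """Text: The movie was a delight from start to finish.
Sentiment: positive

Text: The plot made no sense and the acting was flat.
Sentiment: negative

Text: I was pleasantly surprised by how much I enjoyed it.
Sentiment:"""
sequences = pipe(
    few_shot_prompt,
    max_new_tokens=5,
    do_sample=True,
    top_k=10,
    return_full_text=False,
)
for seq in sequences:
    print(f"Result: {seq['generated_text']}")
# The solved examples demonstrate the exact "Sentiment: <label>" format we want the model to follow.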
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_98.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..95cf5568de9677169d775a2515a8722c78c0fe01 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_98.txt @@ -0,0 +1,2 @@ +LLMs that have been fine-tuned on instruction datasets, generally +perform well on such "zero-shot" tasks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_prompting/chunk_99.txt b/chunked/content_aware_chunking/tasks_prompting/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..28b6d4b3103b568ed003c5cd2d527d954d8169ff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_prompting/chunk_99.txt @@ -0,0 +1,2 @@ +However, you may find that your task has more complexity or nuance, and, perhaps, +you have some requirements for the output that the model doesn't catch on just from the instructions. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_14.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceafac0e394f9222033cd7741d823cb3907265fe --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_14.txt @@ -0,0 +1,10 @@ +Preprocess + +The next step is to load a DistilBERT tokenizer to process the question and context fields: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + +There are a few preprocessing steps particular to question answering tasks you should be aware of: + +Some examples in a dataset may have a very long context that exceeds the maximum input length of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_15.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5a513c8269b723092491eb1302472d36c6bc1e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_15.txt @@ -0,0 +1 @@ +To deal with longer sequences, truncate only the context by setting truncation="only_second". \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_16.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..67ff4075a6565f73ce911bfe600e238c9057b6eb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_16.txt @@ -0,0 +1,2 @@ +Next, map the start and end positions of the answer to the original context by setting + return_offset_mapping=True. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_17.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..57fac083ec2971f7d8b4c630b50f4dc9fde2076d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_17.txt @@ -0,0 +1 @@ +With the mapping in hand, now you can find the start and end tokens of the answer. 
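Before looking at the full preprocessing function, it can help to see what the offset mapping and sequence IDs actually contain. This is a small, illustrative check with made-up strings, assuming the DistilBERT fast tokenizer loaded above; the exact token boundaries depend on the tokenizer:

# Toy question/context pair, used only to inspect the tokenizer output; not part of the SQuAD dataset.
enc = tokenizer(
    "What is gazpacho?",
    "Gazpacho is a cold soup made of raw, blended vegetables.",
    truncation="only_second",
    return_offsets_mapping=True,
)
print(enc.sequence_ids())         # None for special tokens, 0 for question tokens, 1 for context tokens
print(enc["offset_mapping"][:5])  # (start_char, end_char) spans into the original strings

The preprocessing function below uses exactly these two pieces of information to locate the answer span in token space.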
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_18.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..178f9a76c5315b3137222fc6b387cfff50f71287 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_18.txt @@ -0,0 +1,2 @@ +Use the [~tokenizers.Encoding.sequence_ids] method to + find which part of the offset corresponds to the question and which corresponds to the context. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_19.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7ecf663763bfd7e341906dc5d6525367d916b82 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_19.txt @@ -0,0 +1,49 @@ +Here is how you can create a function to truncate and map the start and end tokens of the answer to the context: + +def preprocess_function(examples): + questions = [q.strip() for q in examples["question"]] + inputs = tokenizer( + questions, + examples["context"], + max_length=384, + truncation="only_second", + return_offsets_mapping=True, + padding="max_length", + ) + + offset_mapping = inputs.pop("offset_mapping") + answers = examples["answers"] + start_positions = [] + end_positions = [] + for i, offset in enumerate(offset_mapping): + answer = answers[i] + start_char = answer["answer_start"][0] + end_char = answer["answer_start"][0] + len(answer["text"][0]) + sequence_ids = inputs.sequence_ids(i) + # Find the start and end of the context + idx = 0 + while sequence_ids[idx] != 1: + idx += 1 + context_start = idx + while sequence_ids[idx] == 1: + idx += 1 + context_end = idx - 1 + # If the answer is not fully inside the context, label it (0, 0) + if offset[context_start][0] > end_char or offset[context_end][1] < start_char: + start_positions.append(0) + end_positions.append(0) + else: + # Otherwise it's the start and end token positions + idx = context_start + while idx <= context_end and offset[idx][0] <= start_char: + idx += 1 + start_positions.append(idx - 1) + idx = context_end + while idx >= context_start and offset[idx][1] >= end_char: + idx -= 1 + end_positions.append(idx + 1) + inputs["start_positions"] = start_positions + inputs["end_positions"] = end_positions + return inputs + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [~datasets.Dataset.map] function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_20.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..79578d72a5fd9a3d2fb1624cee9eb2eac9007dee --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_20.txt @@ -0,0 +1 @@ +You can speed up the map function by setting batched=True to process multiple elements of the dataset at once. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_21.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..34e2de7990a9d851785f7a73d569ca9fa240b361 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_21.txt @@ -0,0 +1,5 @@ +Remove any columns you don't need: + +tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names) + +Now create a batch of examples using [DefaultDataCollator]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_22.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..53c604965305807f37803929738ba46c328b5d81 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_22.txt @@ -0,0 +1 @@ +Unlike other data collators in 🤗 Transformers, the [DefaultDataCollator] does not apply any additional preprocessing such as padding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_23.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1228884fa0698b4d0a95a3e81660c0732041c190 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_23.txt @@ -0,0 +1,10 @@ +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator() + +py +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator(return_tensors="tf") + +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_24.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_24.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_25.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..a784b57af1e6e75b330c983033891be4996695a1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_25.txt @@ -0,0 +1,8 @@ +Load DistilBERT with [AutoModelForQuestionAnswering]: + +from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer +model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_26.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_26.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_27.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_27.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_28.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3e0df8213d2be3d2d5ea724ff4f24f64040a4ae --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_28.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_29.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_29.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_30.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5d211cb3473545d63a5c13987d11c074dfe2421 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_30.txt @@ -0,0 +1,25 @@ +training_args = TrainingArguments( + output_dir="my_awesome_qa_model", + evaluation_strategy="epoch", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=3, + weight_decay=0.01, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_squad["train"], + eval_dataset=tokenized_squad["test"], + tokenizer=tokenizer, + data_collator=data_collator, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_31.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2d4e21ebc181cd170cb41c1ce290da2250d71f9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_31.txt @@ -0,0 +1,38 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer +batch_size = 16 +num_epochs = 2 +total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs +optimizer, schedule = create_optimizer( + init_lr=2e-5, + num_warmup_steps=0, + num_train_steps=total_train_steps, + ) + +Then you can load DistilBERT with [TFAutoModelForQuestionAnswering]: + +from transformers import TFAutoModelForQuestionAnswering +model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + tokenized_squad["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_validation_set = model.prepare_tf_dataset( + tokenized_squad["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile: + +import tensorflow as tf +model.compile(optimizer=optimizer) + +The last thing to setup before you start training is to provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_32.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfe3493ab92e73176435767ea3c819933884c8d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_32.txt @@ -0,0 +1,9 @@ +This can be done by specifying where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +callback = PushToHubCallback( + output_dir="my_awesome_qa_model", + tokenizer=tokenizer, + ) + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_33.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c899f0bd8892b8d9cc6df86d3fb6625a0832852 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_33.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callback to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback]) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! 
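As a quick sanity check that the upload worked, the checkpoint can be loaded back directly from the Hub. This is a minimal sketch; "your-username/my_awesome_qa_model" is a placeholder for the repository created by the PushToHubCallback above:

from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering

# Placeholder repo id; replace it with the namespace the callback actually pushed to.
reloaded_tokenizer = AutoTokenizer.from_pretrained("your-username/my_awesome_qa_model")
reloaded_model = TFAutoModelForQuestionAnswering.from_pretrained("your-username/my_awesome_qa_model")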
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_34.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..b55e953a1c9610ea7baa29039ce36c9e29450fa3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_34.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for question answering, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_35.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..10415c56f185168104c9247f2b7017283dbe716c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_35.txt @@ -0,0 +1,2 @@ +Evaluate +Evaluation for question answering requires a significant amount of postprocessing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_36.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..126ee8bf1dd47f877e1cad61b9fd384ceb923ac3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_36.txt @@ -0,0 +1 @@ +To avoid taking up too much of your time, this guide skips the evaluation step. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_37.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b269eda7c97aa96600e2f96f58dea02e468b544 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_37.txt @@ -0,0 +1 @@ +The [Trainer] still calculates the evaluation loss during training so you're not completely in the dark about your model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_38.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2957b9cf21ed179accca5726d9d86420e844587 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_38.txt @@ -0,0 +1 @@ +If you have more time and you're interested in how to evaluate your model for question answering, take a look at the Question answering chapter from the 🤗 Hugging Face Course! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_39.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_39.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_40.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3fd0fde1b231026b52124f47a250c62e9543fe4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_40.txt @@ -0,0 +1,3 @@ +Come up with a question and some context you'd like the model to predict: + +question = "How many programming languages does BLOOM support?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_41.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..760464ada4b98de553f2c7688d703ce3e5495f41 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_41.txt @@ -0,0 +1 @@ +context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_42.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_42.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_question_answering/chunk_43.txt b/chunked/content_aware_chunking/tasks_question_answering/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..478627db16eb90244ca4ab7b635994ccc7fd4620 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_question_answering/chunk_43.txt @@ -0,0 +1,59 @@ +Instantiate a pipeline for question answering with your model, and pass your text to it: + +from transformers import pipeline +question_answerer = pipeline("question-answering", model="my_awesome_qa_model") +question_answerer(question=question, context=context) +{'score': 0.2058267742395401, + 'start': 10, + 'end': 95, + 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'} + +You can also manually replicate the results of the pipeline if you'd like: + +Tokenize the text and return PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +inputs = tokenizer(question, context, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +import torch +from transformers import AutoModelForQuestionAnswering +model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +with torch.no_grad(): + outputs = model(**inputs) + +Get the highest probability from the model output for the start and end positions: + +answer_start_index = outputs.start_logits.argmax() +answer_end_index = outputs.end_logits.argmax() + +Decode the predicted tokens to get the answer: + +predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' + +Tokenize the text and return TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model") +inputs = 
tokenizer(question, context, return_tensors="tf") + +Pass your inputs to the model and return the logits: + +from transformers import TFAutoModelForQuestionAnswering +model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model") +outputs = model(**inputs) + +Get the highest probability from the model output for the start and end positions: + +answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0]) +answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0]) + +Decode the predicted tokens to get the answer: + +predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1] +tokenizer.decode(predict_answer_tokens) +'176 billion parameters and can generate text in 46 languages natural languages and 13' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_23.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..10fdc0883268f7702e92c808090194392d02da69 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_23.txt @@ -0,0 +1 @@ +ImageProcessor also takes care of resizing and normalizing the images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_24.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..e78fc6fa86e00c2cda565e1aecefbd0db41d78a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_24.txt @@ -0,0 +1,12 @@ +def train_transforms(example_batch): + images = [aug_transforms(x.convert("RGB")) for x in example_batch["image"]] + labels = [x for x in example_batch["annotation"]] + inputs = image_processor(images, labels) + return inputs +def val_transforms(example_batch): + images = [transforms(x.convert("RGB")) for x in example_batch["image"]] + labels = [x for x in example_batch["annotation"]] + inputs = image_processor(images, labels) + return inputs + +To apply the preprocessing transformations over the entire dataset, use the 🤗 Datasets [~datasets.Dataset.set_transform] function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_25.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b45737a9f340b488697c2ed0973201b57188cffa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_25.txt @@ -0,0 +1,7 @@ +The transform is applied on the fly which is faster and consumes less disk space: + +train_ds.set_transform(train_transforms) +test_ds.set_transform(val_transforms) + +Evaluate +Including a metric during training is often helpful for evaluating your model's performance. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_26.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fc865af3d53caab0fd532b9d14bef3404dce2b5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_26.txt @@ -0,0 +1 @@ +You can quickly load an evaluation method with the 🤗 Evaluate library. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_27.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d65cb7cc70462b97067e67442151b69d8be53385 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_27.txt @@ -0,0 +1,6 @@ +For this task, load the mean Intersection over Union (IoU) metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +metric = evaluate.load("mean_iou") + +Then create a function to [~evaluate.EvaluationModule.compute] the metrics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_28.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c41c42a2964c2ddfb3384227d302e74c22e92ce --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_28.txt @@ -0,0 +1,54 @@ +Your predictions need to be converted to +logits first, and then reshaped to match the size of the labels before you can call [~evaluate.EvaluationModule.compute]: + +import numpy as np +import torch +from torch import nn +def compute_metrics(eval_pred): + with torch.no_grad(): + logits, labels = eval_pred + logits_tensor = torch.from_numpy(logits) + logits_tensor = nn.functional.interpolate( + logits_tensor, + size=labels.shape[-2:], + mode="bilinear", + align_corners=False, + ).argmax(dim=1) + + pred_labels = logits_tensor.detach().cpu().numpy() + metrics = metric.compute( + predictions=pred_labels, + references=labels, + num_labels=num_labels, + ignore_index=255, + reduce_labels=False, + ) + for key, value in metrics.items(): + if isinstance(value, np.ndarray): + metrics[key] = value.tolist() + return metrics + +def compute_metrics(eval_pred): + logits, labels = eval_pred + logits = tf.transpose(logits, perm=[0, 2, 3, 1]) + logits_resized = tf.image.resize( + logits, + size=tf.shape(labels)[1:], + method="bilinear", + ) + + pred_labels = tf.argmax(logits_resized, axis=-1) + metrics = metric.compute( + predictions=pred_labels, + references=labels, + num_labels=num_labels, + ignore_index=-1, + reduce_labels=image_processor.do_reduce_labels, + ) + per_category_accuracy = metrics.pop("per_category_accuracy").tolist() + per_category_iou = metrics.pop("per_category_iou").tolist() + metrics.update({f"accuracy_{id2label[i]}": v for i, v in enumerate(per_category_accuracy)}) + metrics.update({f"iou_{id2label[i]}": v for i, v in enumerate(per_category_iou)}) + return {"val_" + k: v for k, v in metrics.items()} + +Your compute_metrics function is ready to go now, and you'll return to it when you setup your training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_29.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f493faa6fd8490ca5e6198c99a80a5e6e236cf1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_29.txt @@ -0,0 +1,3 @@ +Train + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_30.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_30.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_31.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..36288c1c5394be4711a33ba433a29a6e03b61f47 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_31.txt @@ -0,0 +1,8 @@ +Load SegFormer with [AutoModelForSemanticSegmentation], and pass the model the mapping between label ids and label classes: + +from transformers import AutoModelForSemanticSegmentation, TrainingArguments, Trainer +model = AutoModelForSemanticSegmentation.from_pretrained(checkpoint, id2label=id2label, label2id=label2id) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_32.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..24eabbc1bfc84472519872be8868cdd20f047315 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_32.txt @@ -0,0 +1 @@ +It is important you don't remove unused columns because this'll drop the image column. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_33.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e341b4d05beb0aeabb3c2b93ba2d643d64906272 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_33.txt @@ -0,0 +1 @@ +Without the image column, you can't create pixel_values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_34.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..841530cb0c0b51477d1c178cac331ec56c22c80a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_34.txt @@ -0,0 +1 @@ +Set remove_unused_columns=False to prevent this behavior! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_35.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c1ad0b6b2c9b9d6e78a522b09d9d023e6c3842d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_35.txt @@ -0,0 +1 @@ +The only other required parameter is output_dir which specifies where to save your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_36.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_36.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_37.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aa9eb47dd15ce5191620aaae7064a6d61cff968 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_37.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the IoU metric and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_38.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_38.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_39.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_39.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_40.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b015548fbc2ecafee70f1570d12dbd74dd837cc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_40.txt @@ -0,0 +1,30 @@ +training_args = TrainingArguments( + output_dir="segformer-b0-scene-parse-150", + learning_rate=6e-5, + num_train_epochs=50, + per_device_train_batch_size=2, + per_device_eval_batch_size=2, + save_total_limit=3, + evaluation_strategy="steps", + save_strategy="steps", + save_steps=20, + eval_steps=20, + logging_steps=1, + eval_accumulation_steps=5, + remove_unused_columns=False, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds, + eval_dataset=test_ds, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you are unfamiliar with fine-tuning a model with Keras, check out the basic tutorial first! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_41.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d13aec409721c5e7024766b2ac948287e12ae57 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_41.txt @@ -0,0 +1,2 @@ +To fine-tune a model in TensorFlow, follow these steps: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_42.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..6641a1f2b655a4b38d7e344f5abc14eb31d665e2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_42.txt @@ -0,0 +1 @@ +Define the training hyperparameters, and set up an optimizer and a learning rate schedule. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_43.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_43.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_44.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1ef61b600500320bfc5ed0701db937221f71591 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_44.txt @@ -0,0 +1 @@ +Instantiate a pretrained model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_45.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_45.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_46.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..87bb1fd7a7425780f38c7b9dcf7df5d17d561eb4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_46.txt @@ -0,0 +1 @@ +Convert a 🤗 Dataset to a tf.data.Dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_47.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_47.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_48.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff34899a31366c166b93096a5c69a295c12ef1bc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_48.txt @@ -0,0 +1 @@ +Compile your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_49.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_49.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_50.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc86a9a5ac19c72e8ad848103e023f5a0ef710b7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_50.txt @@ -0,0 +1,2 @@ +Add callbacks to calculate metrics and upload your model to 🤗 Hub +6. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_51.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba02afcaec0a9ed78112bf81aa7f33756d4bdd5e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_51.txt @@ -0,0 +1 @@ +Use the fit() method to run the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_52.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..749e38aee196bd226f725229a0ebbb038c6c82cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_52.txt @@ -0,0 +1,17 @@ +Start by defining the hyperparameters, optimizer and learning rate schedule: + +from transformers import create_optimizer +batch_size = 2 +num_epochs = 50 +num_train_steps = len(train_ds) * num_epochs +learning_rate = 6e-5 +weight_decay_rate = 0.01 +optimizer, lr_schedule = create_optimizer( + init_lr=learning_rate, + num_train_steps=num_train_steps, + weight_decay_rate=weight_decay_rate, + num_warmup_steps=0, + ) + +Then, load SegFormer with [TFAutoModelForSemanticSegmentation] along with the label mappings, and compile it with the +optimizer. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_53.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f1e5ab950e5ca271e613acf636f64aedf3f1dce --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_53.txt @@ -0,0 +1,9 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +from transformers import TFAutoModelForSemanticSegmentation +model = TFAutoModelForSemanticSegmentation.from_pretrained( + checkpoint, + id2label=id2label, + label2id=label2id, + ) +model.compile(optimizer=optimizer) # No loss argument! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_54.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b27cad539f273cb877592837214defb5a76a3a8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_54.txt @@ -0,0 +1,18 @@ +Convert your datasets to the tf.data.Dataset format using the [~datasets.Dataset.to_tf_dataset] and the [DefaultDataCollator]: + +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator(return_tensors="tf") +tf_train_dataset = train_ds.to_tf_dataset( + columns=["pixel_values", "label"], + shuffle=True, + batch_size=batch_size, + collate_fn=data_collator, + ) +tf_eval_dataset = test_ds.to_tf_dataset( + columns=["pixel_values", "label"], + shuffle=True, + batch_size=batch_size, + collate_fn=data_collator, + ) + +To compute the accuracy from the predictions and push your model to the 🤗 Hub, use Keras callbacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_55.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..4224c51a566630502bbf0a9cfae2d69037152437 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_55.txt @@ -0,0 +1,11 @@ +Pass your compute_metrics function to [KerasMetricCallback], +and use the [PushToHubCallback] to upload the model: + +from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback +metric_callback = KerasMetricCallback( + metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] + ) +push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you are ready to train your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_56.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..e226db3499fed18323309b86ab571f08f94c6d7f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_56.txt @@ -0,0 +1,11 @@ +Call fit() with your training and validation datasets, the number of epochs, +and your callbacks to fine-tune the model: + +model.fit( + tf_train_dataset, + validation_data=tf_eval_dataset, + callbacks=callbacks, + epochs=num_epochs, + ) + +Congratulations! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_57.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..d47f333d728e4db8acd5b419bce2bc1b47ee091a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_57.txt @@ -0,0 +1 @@ +You have fine-tuned your model and shared it on the 🤗 Hub. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_58.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..393d90e98de078560c7b3125197696b7b429d858 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_58.txt @@ -0,0 +1 @@ +You can now use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_59.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_59.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_60.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1520c21744e4a24c24f0e672b0f3b898747a1967 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_60.txt @@ -0,0 +1,6 @@ +Load an image for inference: + +image = ds[0]["image"] +image + +We will now see how to infer without a pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_61.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a3a45449bf7c511449cd1bee0c3b95c06b221bc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_61.txt @@ -0,0 +1,39 @@ +Process the image with an image processor and place the pixel_values on a GPU: + +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use GPU if available, otherwise use a CPU +encoding = image_processor(image, return_tensors="pt") +pixel_values = encoding.pixel_values.to(device) + +Pass your input to the model and return the logits: + +outputs = model(pixel_values=pixel_values) +logits = outputs.logits.cpu() + +Next, rescale the logits to the original image size: + +upsampled_logits = nn.functional.interpolate( + logits, + size=image.size[::-1], + mode="bilinear", + align_corners=False, + ) +pred_seg = upsampled_logits.argmax(dim=1)[0] + +Load an image processor to preprocess the image and return the input as TensorFlow tensors: + +from transformers import AutoImageProcessor +image_processor = AutoImageProcessor.from_pretrained("MariaK/scene_segmentation") +inputs = image_processor(image, return_tensors="tf") + +Pass your input to the model and return the logits: + +from transformers import TFAutoModelForSemanticSegmentation +model = TFAutoModelForSemanticSegmentation.from_pretrained("MariaK/scene_segmentation") +logits = model(**inputs).logits + +Next, rescale the logits to the original image size and apply argmax on the class dimension: + +logits = tf.transpose(logits, [0, 2, 3, 1]) +upsampled_logits = tf.image.resize( + logits, + # We reverse the shape of image because image.size returns width and height. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_62.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4c4cc033351eb34cc83ed6dd789bf13873844c9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_62.txt @@ -0,0 +1,5 @@ +image.size[::-1], + ) +pred_seg = tf.math.argmax(upsampled_logits, axis=-1)[0] + +To visualize the results, load the dataset color palette as ade_palette() that maps each class to their RGB values. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_63.txt b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..2031bd0a6540eb69a0c46da5747b7ab852561497 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_semantic_segmentation/chunk_63.txt @@ -0,0 +1,14 @@ +Then you can combine and plot your image and the predicted segmentation map: + +import matplotlib.pyplot as plt +import numpy as np +color_seg = np.zeros((pred_seg.shape[0], pred_seg.shape[1], 3), dtype=np.uint8) +palette = np.array(ade_palette()) +for label, color in enumerate(palette): + color_seg[pred_seg == label, :] = color +color_seg = color_seg[..., ::-1] # convert to BGR +img = np.array(image) * 0.5 + color_seg * 0.5 # plot the image with the segmentation map +img = img.astype(np.uint8) +plt.figure(figsize=(15, 10)) +plt.imshow(img) +plt.show() \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_17.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7e1710dccb2eb57d5df6bcb2e4315957a16f4fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_17.txt @@ -0,0 +1 @@ +And then bring him back as another actor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_18.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fe1b5f6f0bfdd80acb07af5490c2d1be4f60993 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_18.txt @@ -0,0 +1 @@ +Jeeez! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_19.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8a36dee24294b1fd11821c6d4bf1535f9b9cfae --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_19.txt @@ -0,0 +1 @@ +Dallas all over again. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_20.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..67da263f150efcaeceb33b74b530ac3f998d3a4f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_20.txt @@ -0,0 +1,6 @@ +", +} + +There are two fields in this dataset: + +text: the movie review text.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_21.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1f20730667b3b9e90b3ad6bd17ab9c5b425f94a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_21.txt @@ -0,0 +1 @@ +label: a value that is either 0 for a negative review or 1 for a positive review. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_22.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..78b84fbea3062041d711b679a373012f270ac630 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_22.txt @@ -0,0 +1,12 @@ +Preprocess +The next step is to load a DistilBERT tokenizer to preprocess the text field: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + +Create a preprocessing function to tokenize text and truncate sequences to be no longer than DistilBERT's maximum input length: + +def preprocess_function(examples): + return tokenizer(examples["text"], truncation=True) + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [~datasets.Dataset.map] function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_23.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d25a8235880d610cee5a1b027b70afeef1f81356 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_23.txt @@ -0,0 +1,4 @@ +You can speed up map by setting batched=True to process multiple elements of the dataset at once: +py +tokenized_imdb = imdb.map(preprocess_function, batched=True) +Now create a batch of examples using [DataCollatorWithPadding]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_24.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fa2f3dc3281d831e79faf4697f0fba60bd5c93d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_24.txt @@ -0,0 +1 @@ +It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_25.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..a44ad59f7e6176fae6a8244bd3871bfe5788d811 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_25.txt @@ -0,0 +1,9 @@ +from transformers import DataCollatorWithPadding +data_collator = DataCollatorWithPadding(tokenizer=tokenizer) + +py +from transformers import DataCollatorWithPadding +data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf") + +Evaluate +Including a metric during training is often helpful for evaluating your model's performance. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_26.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb54a12b788c6744523d6aed7df8543dd99480f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_26.txt @@ -0,0 +1 @@ +You can quickly load an evaluation method with the 🤗 Evaluate library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d5f147cfbfa8389628e615d4376203a58f3efd5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_27.txt @@ -0,0 +1,14 @@ +For this task, load the accuracy metric (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric): + +import evaluate +accuracy = evaluate.load("accuracy") + +Then create a function that passes your predictions and labels to [~evaluate.EvaluationModule.compute] to calculate the accuracy: + +import numpy as np +def compute_metrics(eval_pred): + predictions, labels = eval_pred + predictions = np.argmax(predictions, axis=1) + return accuracy.compute(predictions=predictions, references=labels) + +Your compute_metrics function is ready to go now, and you'll return to it when you set up your training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0aa3c7e3a02dfecfbdddb11a61c1f9021d8c28d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_28.txt @@ -0,0 +1,7 @@ +Train +Before you start training your model, create a map of the expected ids to their labels with id2label and label2id: + +id2label = {0: "NEGATIVE", 1: "POSITIVE"} +label2id = {"NEGATIVE": 0, "POSITIVE": 1} + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_29.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_29.txt @@ -0,0 +1 @@ +You're ready to start training your model now!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_30.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..fffe435638f63ba9a1fba70117a3d786683887da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_30.txt @@ -0,0 +1,10 @@ +Load DistilBERT with [AutoModelForSequenceClassification] along with the number of expected labels, and the label mappings: + +from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer +model = AutoModelForSequenceClassification.from_pretrained( + "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id + ) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_31.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_31.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_32.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_32.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_33.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c977d29a14cc4699a4b06a477bb52c2490bdc8e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_33.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the accuracy and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_34.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_34.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_35.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_35.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_36.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..81fbb927939147b4e407e431fc764d4ede8e5d50 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_36.txt @@ -0,0 +1,24 @@ +training_args = TrainingArguments( + output_dir="my_awesome_model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_imdb["train"], + eval_dataset=tokenized_imdb["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) +trainer.train() + +[Trainer] applies dynamic padding by default when you pass tokenizer to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_37.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc9367eb2bc54bd9d30047a0245e677a96bb1a35 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_37.txt @@ -0,0 +1 @@ +In this case, you don't need to specify a data collator explicitly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_38.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d8f78766b2cf40baa31eca27b8aa48bad61b85b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_38.txt @@ -0,0 +1,5 @@ +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_39.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fa2eb23dc5230c16c358c83dfea7a57664941ea --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_39.txt @@ -0,0 +1,33 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer +import tensorflow as tf +batch_size = 16 +num_epochs = 5 +batches_per_epoch = len(tokenized_imdb["train"]) // batch_size +total_train_steps = int(batches_per_epoch * num_epochs) +optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps) + +Then you can load DistilBERT with [TFAutoModelForSequenceClassification] along with the number of expected labels, and the label mappings: + +from transformers import TFAutoModelForSequenceClassification +model = TFAutoModelForSequenceClassification.from_pretrained( + "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id + ) + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + tokenized_imdb["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_validation_set = model.prepare_tf_dataset( + tokenized_imdb["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_40.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_40.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_41.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..778523a4558517d06c3b5f23ba5eeb1897e7b0ef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_41.txt @@ -0,0 +1 @@ +The last two things to setup before you start training is to compute the accuracy from the predictions, and provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_42.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69fb6436e3e5a775a94edf0a0ecab9dd3584f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_42.txt @@ -0,0 +1 @@ +Both are done by using Keras callbacks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_43.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ba3cbd8b5ad819ce967d3055a9ef00ea5ee5021 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_43.txt @@ -0,0 +1,18 @@ +Pass your compute_metrics function to [~transformers.KerasMetricCallback]: + +from transformers.keras_callbacks import KerasMetricCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) + +Specify where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="my_awesome_model", + tokenizer=tokenizer, + ) + +Then bundle your callbacks together: + +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_44.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..3694dbd6905b52a14adc7290cc8bb3ae3f8ddfad --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_44.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_45.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba217da41fc93d1e028366475958c161c4fc340d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_45.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for text classification, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_46.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_46.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_47.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8c415f0208d4be8cb8d97d06538536b38ea1999 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_47.txt @@ -0,0 +1,3 @@ +Grab some text you'd like to run inference on: + +text = "This was a masterpiece. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_48.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..c997fe7c25d31475071f9fc18bf94b6b25086621 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_48.txt @@ -0,0 +1 @@ +Not completely faithful to the books, but enthralling from beginning to end. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_49.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..01324ceb07ebffee50106b14a709514e3b3da15b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_49.txt @@ -0,0 +1 @@ +Might be my favorite of the three." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_50.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_50.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_sequence_classification/chunk_51.txt b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4bb2a6718f16d96915dbad7b323c7bd3a9f46ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_sequence_classification/chunk_51.txt @@ -0,0 +1,45 @@ +Instantiate a pipeline for sentiment analysis with your model, and pass your text to it: + +from transformers import pipeline +classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model") +classifier(text) +[{'label': 'POSITIVE', 'score': 0.9994940757751465}] + +You can also manually replicate the results of the pipeline if you'd like: + +Tokenize the text and return PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +inputs = tokenizer(text, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +with torch.no_grad(): + logits = model(**inputs).logits + +Get the class with the highest probability, and use the model's id2label mapping to convert it to a text label: + +predicted_class_id = logits.argmax().item() +model.config.id2label[predicted_class_id] +'POSITIVE' + +Tokenize the text and return TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model") +inputs = tokenizer(text, return_tensors="tf") + +Pass your inputs to the model and return the logits: + +from transformers import TFAutoModelForSequenceClassification +model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model") +logits = model(**inputs).logits + +Get the class with the highest probability, and use the model's id2label mapping to convert it to a text label: + +predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0]) 
+model.config.id2label[predicted_class_id] +'POSITIVE' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_28.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_28.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_29.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..16db9690293d00074bf55b1f8791059c7eca140a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_29.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the ROUGE metric and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_30.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcfd7b7e5d8763fbb80800abd33a0c33a328edf9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_30.txt @@ -0,0 +1 @@ +Pass the training arguments to [Seq2SeqTrainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_31.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_31.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_32.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..d93c0e6fd3999b25930c613d673d30adc0af2d3e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_32.txt @@ -0,0 +1,29 @@ +training_args = Seq2SeqTrainingArguments( + output_dir="my_awesome_billsum_model", + evaluation_strategy="epoch", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + weight_decay=0.01, + save_total_limit=3, + num_train_epochs=4, + predict_with_generate=True, + fp16=True, + push_to_hub=True, + ) +trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=tokenized_billsum["train"], + eval_dataset=tokenized_billsum["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_33.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..da3bfda03f6ab20c1169d54c0980574b00d09a4b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_33.txt @@ -0,0 +1,26 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer, AdamWeightDecay +optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) + +Then you can load T5 with [TFAutoModelForSeq2SeqLM]: + +from transformers import TFAutoModelForSeq2SeqLM +model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + tokenized_billsum["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_test_set = model.prepare_tf_dataset( + tokenized_billsum["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_34.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_34.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_35.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..48370d77a743781bc8c110b59d99cc0a4391cbaf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_35.txt @@ -0,0 +1 @@ +The last two things to setup before you start training is to compute the ROUGE score from the predictions, and provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_36.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69fb6436e3e5a775a94edf0a0ecab9dd3584f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_36.txt @@ -0,0 +1 @@ +Both are done by using Keras callbacks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_37.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..fec2f0b4c103b3f53b3d0912bf66eb26c992a7f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_37.txt @@ -0,0 +1,18 @@ +Pass your compute_metrics function to [~transformers.KerasMetricCallback]: + +from transformers.keras_callbacks import KerasMetricCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) + +Specify where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="my_awesome_billsum_model", + tokenizer=tokenizer, + ) + +Then bundle your callbacks together: + +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_38.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..053ce6bb5c418eb39cc4883d00e530f348b28091 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_38.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_39.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..855f5ece4ff13af6188cc2be4f54730ec76ba21f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_39.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_40.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_40.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_41.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..c314c65a91ce7ba9dfef2b2b4fb4f8d641b78667 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_41.txt @@ -0,0 +1 @@ +Come up with some text you'd like to summarize. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_42.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..723726f54e6765014f0e953f098361d2ad54edf6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_42.txt @@ -0,0 +1 @@ +For T5, you need to prefix your input depending on the task you're working on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_43.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4c546f51a941b4648ed554c53a3f30aa845126 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_43.txt @@ -0,0 +1,3 @@ +For summarization you should prefix your input as shown below: + +text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_44.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..8de3b1a6e66a6cf9c81f5e711f798da270d0187b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_44.txt @@ -0,0 +1 @@ +It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_45.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..89c966f51a4a04cc7cc4eaa59a85187ee1a30e4a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_45.txt @@ -0,0 +1 @@ +It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_46.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ee5d03c3e8c6fbbd838252ed0bab6483b7c175f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_46.txt @@ -0,0 +1 @@ +And no one making under $400,000 per year will pay a penny more in taxes." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_47.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_47.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_48.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..f783376238f9635cbceac34d44284032bee7c445 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_48.txt @@ -0,0 +1,6 @@ +Instantiate a pipeline for summarization with your model, and pass your text to it: + +from transformers import pipeline +summarizer = pipeline("summarization", model="stevhliu/my_awesome_billsum_model") +summarizer(text) +[{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_49.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e0828c6900df3b1c1429b23c16000f62b2b79aa --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_49.txt @@ -0,0 +1 @@ +It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_50.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba26eeaa3f16e6df6570cd727dd5f1a155c7dc3a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_50.txt @@ -0,0 +1,9 @@ +You can also manually replicate the results of the pipeline if you'd like: + +Tokenize the text and return the input_ids as PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +inputs = tokenizer(text, return_tensors="pt").input_ids + +Use the [~transformers.generation_utils.GenerationMixin.generate] method to create the summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_51.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e7527dc0ea43ab0e9406d5f641704a798a04c7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_51.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text Generation API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_52.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcfcb3bbb2ab685713be7220926f04eace47e71d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_52.txt @@ -0,0 +1,8 @@ +from transformers import AutoModelForSeq2SeqLM +model = AutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) + +Decode the generated token ids back into text: + +tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_53.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..baa7726b5073064d7f6e58e2380205dd49a2c221 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_53.txt @@ -0,0 +1 @@ +it's the most aggressive action on tackling the climate crisis in american history. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_54.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..07102d9c9c7828c50ffbdc03df7b536cdb41276f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_54.txt @@ -0,0 +1 @@ +it will ask the ultra-wealthy and corporations to pay their fair share.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_55.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..b48c8a9ee9660fa6e2d62b7ba168507c78e1240d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_55.txt @@ -0,0 +1,10 @@ +`` + + +Tokenize the text and return theinput_ids` as TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_billsum_model") +inputs = tokenizer(text, return_tensors="tf").input_ids + +Use the [~transformers.generation_tf_utils.TFGenerationMixin.generate] method to create the summarization. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_56.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e7527dc0ea43ab0e9406d5f641704a798a04c7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_56.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text Generation API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_57.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..69801ad8d76a08356df691b799d2fb938de0dd6c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_57.txt @@ -0,0 +1,8 @@ +from transformers import TFAutoModelForSeq2SeqLM +model = TFAutoModelForSeq2SeqLM.from_pretrained("stevhliu/my_awesome_billsum_model") +outputs = model.generate(inputs, max_new_tokens=100, do_sample=False) + +Decode the generated token ids back into text: + +tokenizer.decode(outputs[0], skip_special_tokens=True) +'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_58.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..baa7726b5073064d7f6e58e2380205dd49a2c221 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_58.txt @@ -0,0 +1 @@ +it's the most aggressive action on tackling the climate crisis in american history. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_summarization/chunk_59.txt b/chunked/content_aware_chunking/tasks_summarization/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..07102d9c9c7828c50ffbdc03df7b536cdb41276f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_summarization/chunk_59.txt @@ -0,0 +1 @@ +it will ask the ultra-wealthy and corporations to pay their fair share.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_100.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb2fcf12b6dd55e0df046ef1f24bfc4a58d0d89c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_100.txt @@ -0,0 +1 @@ +Increasing the training duration is also likely to enhance the quality of the results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_101.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..baa2804829d94715fd018a58a3e02d47e0792fbc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_101.txt @@ -0,0 +1,2 @@ +Even so, the speech clearly is Dutch instead of English, and it does +capture the voice characteristics of the speaker (compare to the original audio in the example). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_102.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2fe71f2075ea294fcbe792a3d93c8c85f7bb4e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_102.txt @@ -0,0 +1 @@ +Another thing to experiment with is the model's configuration. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_103.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a1fdee6130e382f16129a0cce1f97825dce9c88 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_103.txt @@ -0,0 +1,2 @@ +For example, try using config.reduction_factor = 1 to +see if this improves the results. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_104.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..018af7ab3bd323585c3bc691971f97e92363ec62 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_104.txt @@ -0,0 +1 @@ +Finally, it is essential to consider ethical considerations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_105.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..1542ad387e1f3dc54c17cbd4eaa00963a5bc2e5c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_105.txt @@ -0,0 +1,2 @@ +Although TTS technology has numerous useful applications, it +may also be used for malicious purposes, such as impersonating someone's voice without their knowledge or consent. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_106.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba55399d5a08a092089195bca2ad313ba06d2df8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_106.txt @@ -0,0 +1,2 @@ +Please +use TTS judiciously and responsibly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_41.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee460d4bf21312647ec94f7298fb9bc860ec60d0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_41.txt @@ -0,0 +1,2 @@ +With a total of 20,968 examples in the dataset, this information will give us a better understanding of the distribution of +speakers and examples in the data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_42.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a163f54be87e7df3322846e9f901a3eb38aef8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_42.txt @@ -0,0 +1,6 @@ +from collections import defaultdict +speaker_counts = defaultdict(int) +for speaker_id in dataset["speaker_id"]: + speaker_counts[speaker_id] += 1 + +By plotting a histogram you can get a sense of how much data there is for each speaker. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_43.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f94938fe200f8e0fa20b2346ac83d0793a4d6a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_43.txt @@ -0,0 +1,9 @@ +import matplotlib.pyplot as plt +plt.figure() +plt.hist(speaker_counts.values(), bins=20) +plt.ylabel("Speakers") +plt.xlabel("Examples") +plt.show() + +The histogram reveals that approximately one-third of the speakers in the dataset have fewer than 100 examples, while +around ten speakers have more than 500 examples. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_44.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d0bea1ca933611e070c3dc4ff87874db4e7d9c4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_44.txt @@ -0,0 +1,2 @@ +To improve training efficiency and balance the dataset, we can limit +the data to speakers with between 100 and 400 examples. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_45.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb2a50467e578388d5ce91230825d28370f19306 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_45.txt @@ -0,0 +1,15 @@ +def select_speaker(speaker_id): + return 100 <= speaker_counts[speaker_id] <= 400 +dataset = dataset.filter(select_speaker, input_columns=["speaker_id"]) + +Let's check how many speakers remain: + +len(set(dataset["speaker_id"])) +42 + +Let's see how many examples are left: + +len(dataset) +9973 + +You are left with just under 10,000 examples from approximately 40 unique speakers, which should be sufficient. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_46.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7c37e098403dede95df665007d419ea4ccc5d2c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_46.txt @@ -0,0 +1 @@ +Note that some speakers with few examples may actually have more audio available if the examples are long. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_47.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5a2447c296ddfc30e6905a5fee5dd3550de8852 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_47.txt @@ -0,0 +1,3 @@ +However, +determining the total amount of audio for each speaker requires scanning through the entire dataset, which is a +time-consuming process that involves loading and decoding each audio file. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_48.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..de65782048190b779c0fa13e928640e7a8fd804c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_48.txt @@ -0,0 +1 @@ +As such, we have chosen to skip this step here. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_49.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..9413489d334dff4c60f077979aa4313ae992220a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_49.txt @@ -0,0 +1,2 @@ +Speaker embeddings +To enable the TTS model to differentiate between multiple speakers, you'll need to create a speaker embedding for each example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_50.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..4353abf23367b1e05233557736f0a4fa5c934361 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_50.txt @@ -0,0 +1 @@ +The speaker embedding is an additional input into the model that captures a particular speaker's voice characteristics. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_51.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9a3966897143b4620474494821f02dbda02bb45 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_51.txt @@ -0,0 +1,2 @@ +To generate these speaker embeddings, use the pre-trained spkrec-xvect-voxceleb +model from SpeechBrain. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_52.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a061ea5fa934dc71cd5d0253cfc22242a155946 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_52.txt @@ -0,0 +1,2 @@ +Create a function create_speaker_embedding() that takes an input audio waveform and outputs a 512-element vector +containing the corresponding speaker embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_53.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c97df82b9391035393f73aa9cd0b2a5538702a9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_53.txt @@ -0,0 +1,19 @@ +import os +import torch +from speechbrain.pretrained import EncoderClassifier +spk_model_name = "speechbrain/spkrec-xvect-voxceleb" +device = "cuda" if torch.cuda.is_available() else "cpu" +speaker_model = EncoderClassifier.from_hparams( + source=spk_model_name, + run_opts={"device": device}, + savedir=os.path.join("/tmp", spk_model_name), + ) +def create_speaker_embedding(waveform): + with torch.no_grad(): + speaker_embeddings = speaker_model.encode_batch(torch.tensor(waveform)) + speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2) + speaker_embeddings = speaker_embeddings.squeeze().cpu().numpy() + return speaker_embeddings + +It's important to note that the speechbrain/spkrec-xvect-voxceleb model was trained on English speech from the VoxCeleb +dataset, whereas the training examples in this guide are in Dutch. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_54.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..d49e8141c67d99a45e0e09672f3e8739bd4614d3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_54.txt @@ -0,0 +1,2 @@ +While we believe that this model will still generate +reasonable speaker embeddings for our Dutch dataset, this assumption may not hold true in all cases. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_55.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..debf939af1f447934a5467ffdd937bec13e96465 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_55.txt @@ -0,0 +1 @@ +For optimal results, we recommend training an X-vector model on the target speech first. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_56.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bf423bcefbea626d564f505f211674b6e0b04c1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_56.txt @@ -0,0 +1,2 @@ +This will ensure that the model +is better able to capture the unique voice characteristics present in the Dutch language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_57.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..a811f9907916e2ab7c892737cda98ea6bc9984a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_57.txt @@ -0,0 +1,2 @@ +Processing the dataset +Finally, let's process the data into the format the model expects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_58.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab3d019a6288bb1d353020974b5b5d20088a29cf --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_58.txt @@ -0,0 +1,2 @@ +Create a prepare_dataset function that takes in a +single example and uses the SpeechT5Processor object to tokenize the input text and load the target audio into a log-mel spectrogram. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_59.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8b81a2077cfab6912b5c238fa56b791a7447b3b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_59.txt @@ -0,0 +1 @@ +It should also add the speaker embeddings as an additional input. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_60.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b474bdd44ada3d4bbcf40755b00c77eecae5f40 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_60.txt @@ -0,0 +1,27 @@ +def prepare_dataset(example): + audio = example["audio"] + + example = processor( + text=example["normalized_text"], + audio_target=audio["array"], + sampling_rate=audio["sampling_rate"], + return_attention_mask=False, + ) + # strip off the batch dimension + example["labels"] = example["labels"][0] + # use SpeechBrain to obtain x-vector + example["speaker_embeddings"] = create_speaker_embedding(audio["array"]) + return example + +Verify the processing is correct by looking at a single example: + +processed_example = prepare_dataset(dataset[0]) +list(processed_example.keys()) +['input_ids', 'labels', 'stop_labels', 'speaker_embeddings'] + +Speaker embeddings should be a 512-element vector: + +processed_example["speaker_embeddings"].shape +(512,) + +The labels should be a log-mel spectrogram with 80 mel bins. 
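A quick, optional way to verify this (a minimal sketch, assuming the processed_example from the step above is still in scope):

import numpy as np

# Hypothetical shape check: the last dimension should be the 80 mel bins.
labels = np.asarray(processed_example["labels"])
print(labels.shape)  # e.g. (num_frames, 80)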
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_61.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..d52b875dca7f76652b1826ef71929c33e092d679 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_61.txt @@ -0,0 +1,7 @@ +import matplotlib.pyplot as plt +plt.figure() +plt.imshow(processed_example["labels"].T) +plt.show() + +Side note: If you find this spectrogram confusing, it may be due to your familiarity with the convention of placing low frequencies +at the bottom and high frequencies at the top of a plot. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_62.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff751fbecba6192b9c4d2988ebb1723caf1e0a0f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_62.txt @@ -0,0 +1,2 @@ +However, when plotting spectrograms as an image using the matplotlib library, +the y-axis is flipped and the spectrograms appear upside down. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_63.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..56d3719fdb6081b19d29754e153442d86690982b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_63.txt @@ -0,0 +1 @@ +Now apply the processing function to the entire dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_64.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..677a0b4c2b9776308195678514619e415d08e293 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_64.txt @@ -0,0 +1 @@ +This will take between 5 and 10 minutes. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_65.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b048e103307392a2fec09b71aaa46128b890e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_65.txt @@ -0,0 +1,3 @@ +dataset = dataset.map(prepare_dataset, remove_columns=dataset.column_names) + +You'll see a warning saying that some examples in the dataset are longer than the maximum input length the model can handle (600 tokens). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_66.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e74e37223f928bfb2f805cab7867b8dbd194eb2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_66.txt @@ -0,0 +1 @@ +Remove those examples from the dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_67.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..91ca77b2c7e1df9af6b3b4ecc2939b7c3d35ffef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_67.txt @@ -0,0 +1 @@ +Here we go even further and to allow for larger batch sizes we remove anything over 200 tokens. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_68.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba7302f0e785e7052e690f247503107657cddcb1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_68.txt @@ -0,0 +1,13 @@ +def is_not_too_long(input_ids): + input_length = len(input_ids) + return input_length < 200 +dataset = dataset.filter(is_not_too_long, input_columns=["input_ids"]) +len(dataset) +8259 + +Next, create a basic train/test split: + +dataset = dataset.train_test_split(test_size=0.1) + +Data collator +In order to combine multiple examples into a batch, you need to define a custom data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_69.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..e25f86945eed2c7d5a807408d13bcb27dec3fbe8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_69.txt @@ -0,0 +1,2 @@ +This collator will pad shorter sequences with padding +tokens, ensuring that all examples have the same length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_70.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6478fd571a4b74cb8d79623dda86353bbbe915b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_70.txt @@ -0,0 +1 @@ +For the spectrogram labels, the padded portions are replaced with the special value -100. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_71.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..96ce7c60eb70b1f323aefda417cde094fc276496 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_71.txt @@ -0,0 +1,2 @@ +This special value +instructs the model to ignore that part of the spectrogram when calculating the spectrogram loss. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_72.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..2df7beb07e2c3d904b6de9f98ba85894bb15983a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_72.txt @@ -0,0 +1,29 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Union +@dataclass +class TTSDataCollatorWithPadding: + processor: Any + + def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]: + input_ids = [{"input_ids": feature["input_ids"]} for feature in features] + label_features = [{"input_values": feature["labels"]} for feature in features] + speaker_features = [feature["speaker_embeddings"] for feature in features] + # collate the inputs and targets into a batch + batch = processor.pad(input_ids=input_ids, labels=label_features, return_tensors="pt") + # replace padding with -100 to ignore loss correctly + batch["labels"] = batch["labels"].masked_fill(batch.decoder_attention_mask.unsqueeze(-1).ne(1), -100) + # not used during fine-tuning + del batch["decoder_attention_mask"] + # round down target lengths to multiple of reduction factor + if model.config.reduction_factor > 1: + target_lengths = torch.tensor([len(feature["input_values"]) for feature in label_features]) + target_lengths = target_lengths.new( + [length - length % model.config.reduction_factor for length in target_lengths] + ) + max_length = max(target_lengths) + batch["labels"] = batch["labels"][:, :max_length] + # also add in the speaker embeddings + batch["speaker_embeddings"] = torch.tensor(speaker_features) + return batch + +In SpeechT5, the input to the decoder part of the model is reduced by a factor of 2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_73.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5c06c996517b5e93757cb6510c95aff4ab1b673 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_73.txt @@ -0,0 +1,2 @@ +In other words, it throws away every +other timestep from the target sequence. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_74.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..9be648444f1613e03bdafe08e42e052d42061b47 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_74.txt @@ -0,0 +1 @@ +The decoder then predicts a sequence that is twice as long. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_75.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..f444f891d5549a2e5555237ad204051e24618797 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_75.txt @@ -0,0 +1,14 @@ +Since the original +target sequence length may be odd, the data collator makes sure to round the maximum length of the batch down to be a +multiple of 2.
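To make the rounding step concrete, here is a tiny illustration of the length - length % reduction_factor expression used in the collator above (the numbers are made up for illustration, and a reduction factor of 2 is assumed, as described for SpeechT5):

reduction_factor = 2
for length in (416, 417):
    print(length, "->", length - length % reduction_factor)
# 416 -> 416
# 417 -> 416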
+ + +data_collator = TTSDataCollatorWithPadding(processor=processor) + +Train the model +Load the pre-trained model from the same checkpoint as you used for loading the processor: + +from transformers import SpeechT5ForTextToSpeech +model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint) + +The use_cache=True option is incompatible with gradient checkpointing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_76.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..76deca19d9b5eb65a7c4c362ee6f44948df15a68 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_76.txt @@ -0,0 +1 @@ +Disable it for training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_77.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..69ca92b0727a927effcea542d14efb46a29ab0e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_77.txt @@ -0,0 +1,3 @@ +model.config.use_cache = False + +Define the training arguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_78.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdde5b79d4c7dcd5d710e459e8915c1ac842c02a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_78.txt @@ -0,0 +1 @@ +Here we are not computing any evaluation metrics during the training process. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_79.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..34c9b7a7cbdf534de9d6fc859d05ed907e3ba3bd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_79.txt @@ -0,0 +1,26 @@ +Instead, we'll +only look at the loss: + +from transformers import Seq2SeqTrainingArguments +training_args = Seq2SeqTrainingArguments( + output_dir="speecht5_finetuned_voxpopuli_nl", # change to a repo name of your choice + per_device_train_batch_size=4, + gradient_accumulation_steps=8, + learning_rate=1e-5, + warmup_steps=500, + max_steps=4000, + gradient_checkpointing=True, + fp16=True, + evaluation_strategy="steps", + per_device_eval_batch_size=2, + save_steps=1000, + eval_steps=1000, + logging_steps=25, + report_to=["tensorboard"], + load_best_model_at_end=True, + greater_is_better=False, + label_names=["labels"], + push_to_hub=True, + ) + +Instantiate the Trainer object and pass the model, dataset, and data collator to it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_80.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b0b30196c642af454089aec92ac9e3c8cb214b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_80.txt @@ -0,0 +1,11 @@ +from transformers import Seq2SeqTrainer +trainer = Seq2SeqTrainer( + args=training_args, + model=model, + train_dataset=dataset["train"], + eval_dataset=dataset["test"], + data_collator=data_collator, + tokenizer=processor, + ) + +And with that, you're ready to start training!
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_81.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ee723b8449af0f225779ce7335322a7a0bb16ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_81.txt @@ -0,0 +1 @@ +Training will take several hours. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_82.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..8845fcff2e6a220b00f7faa7b776def624d4e75e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_82.txt @@ -0,0 +1,2 @@ +Depending on your GPU, +it is possible that you will encounter a CUDA "out-of-memory" error when you start training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_83.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..8623b8736f376d8e6aa10d23c8237085430232ad --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_83.txt @@ -0,0 +1,2 @@ +In this case, you can reduce +the per_device_train_batch_size incrementally by factors of 2 and increase gradient_accumulation_steps by 2x to compensate. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_84.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..71761f8480a64d911893f67adfa451f69cd3bf9d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_84.txt @@ -0,0 +1,13 @@ +trainer.train() + +To be able to use your checkpoint with a pipeline, make sure to save the processor with the checkpoint: + +processor.save_pretrained("YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") + +Push the final model to the 🤗 Hub: + +trainer.push_to_hub() + +Inference +Inference with a pipeline +Great, now that you've fine-tuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_85.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..620ae248761a121b111ef87425697215b3d32318 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_85.txt @@ -0,0 +1 @@ +First, let's see how you can use it with a corresponding pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_86.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..6baecaec24ee062791b830847264586432518184 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_86.txt @@ -0,0 +1,7 @@ +Let's create a "text-to-speech" pipeline with your +checkpoint: + +from transformers import pipeline +pipe = pipeline("text-to-speech", model="YOUR_ACCOUNT_NAME/speecht5_finetuned_voxpopuli_nl") + +Pick a piece of text in Dutch you'd like narrated, e.g. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_87.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8bd657f3572e0f35bf3e070d4e8f5f8f0054b8a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_87.txt @@ -0,0 +1,3 @@ +: + +text = "hallo allemaal, ik praat nederlands. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_88.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a94a3cf542b7207f23ff76e94d369550a33a4f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_88.txt @@ -0,0 +1 @@ +groetjes aan iedereen!" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_89.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..91fa15aafaa602ffa82674b88993d3f67fdb1ae0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_89.txt @@ -0,0 +1 @@ +To use SpeechT5 with the pipeline, you'll need a speaker embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_90.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec1501d72a80888994f59112acfcf1dbacd4c1dc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_90.txt @@ -0,0 +1,21 @@ +Let's get it from an example in the test dataset: + +example = dataset["test"][304] +speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) + +Now you can pass the text and speaker embeddings to the pipeline, and it will take care of the rest: + +forward_params = {"speaker_embeddings": speaker_embeddings} +output = pipe(text, forward_params=forward_params) +output +{'audio': array([-6.82714235e-05, -4.26525949e-04, 1.06134125e-04, ..., + -1.22392643e-03, -7.76011671e-04, 3.29112721e-04], dtype=float32), + 'sampling_rate': 16000} + +You can then listen to the result: + +from IPython.display import Audio +Audio(output['audio'], rate=output['sampling_rate']) + +Run inference manually +You can achieve the same inference results without using the pipeline; however, more steps will be required. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_91.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9760d275d936b1add24f92b5dac28549f222927 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_91.txt @@ -0,0 +1,5 @@ +Load the model from the 🤗 Hub: + +model = SpeechT5ForTextToSpeech.from_pretrained("YOUR_ACCOUNT/speecht5_finetuned_voxpopuli_nl") + +Pick an example from the test dataset to obtain a speaker embedding.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_92.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb220137cf69c0c52529fbd457006b56b81713ec --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_92.txt @@ -0,0 +1,4 @@ +example = dataset["test"][304] +speaker_embeddings = torch.tensor(example["speaker_embeddings"]).unsqueeze(0) + +Define the input text and tokenize it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_93.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1c7dd48f705d4c1f88c7bcd7866e4642086bb18 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_93.txt @@ -0,0 +1 @@ +text = "hallo allemaal, ik praat nederlands. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_94.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a94a3cf542b7207f23ff76e94d369550a33a4f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_94.txt @@ -0,0 +1 @@ +groetjes aan iedereen!" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_95.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..364caf80e70f06edb851a72bc25a2f85810b3aed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_95.txt @@ -0,0 +1,13 @@ +inputs = processor(text=text, return_tensors="pt") + +Create a spectrogram with your model: + +spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings) + +Visualize the spectrogram, if you'd like to: + +plt.figure() +plt.imshow(spectrogram.T) +plt.show() + +Finally, use the vocoder to turn the spectrogram into sound. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_96.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..29bb211492b4c312256436d22d0f02647fa8e798 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_96.txt @@ -0,0 +1,6 @@ +with torch.no_grad(): + speech = vocoder(spectrogram) +from IPython.display import Audio +Audio(speech.numpy(), rate=16000) + +In our experience, obtaining satisfactory results from this model can be challenging. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_97.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac078db02507d198dcb438c9bcc4628671b4a491 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_97.txt @@ -0,0 +1,2 @@ +The quality of the speaker +embeddings appears to be a significant factor. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_98.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd3304653c5715af435e2823c05b6963f82e9769 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_98.txt @@ -0,0 +1,2 @@ +Since SpeechT5 was pre-trained with English x-vectors, it performs best +when using English speaker embeddings. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_text-to-speech/chunk_99.txt b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..13f28f102dd0252fad6e8a92a63b4a05dfb84402 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_text-to-speech/chunk_99.txt @@ -0,0 +1 @@ +If the synthesized speech sounds poor, try using a different speaker embedding. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_16.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..699ed3fb54f922b3c9af796805a4cc840481a53b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_16.txt @@ -0,0 +1 @@ +Assigning the label -100 to the special tokens [CLS] and [SEP] so they're ignored by the PyTorch loss function (see CrossEntropyLoss). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_17.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d87c838650a0a471533b3d6cfce64c75b2617b74 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_17.txt @@ -0,0 +1 @@ +Only labeling the first token of a given word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_18.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9e0423fd676963ed1b80bbcaa9c30c0cd879feb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_18.txt @@ -0,0 +1 @@ +Assign -100 to other subtokens from the same word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_19.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa8324bc71c507700f4912c9fcd11689d36a7cd4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_19.txt @@ -0,0 +1,8 @@ +Here is how you can create a function to realign the tokens and labels, and truncate sequences to be no longer than DistilBERT's maximum input length: + +def tokenize_and_align_labels(examples): + tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True) + + labels = [] + for i, label in enumerate(examples[f"ner_tags"]): + word_ids = tokenized_inputs.word_ids(batch_index=i) # Map tokens to their respective word. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_20.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..a15dca4e1a66897f36a91f1adf80ea599974ef78 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_20.txt @@ -0,0 +1,6 @@ +previous_word_idx = None + label_ids = [] + for word_idx in word_ids: # Set the special tokens to -100. + if word_idx is None: + label_ids.append(-100) + elif word_idx != previous_word_idx: # Only label the first token of a given word. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_21.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..042b25069d1a20ed3f3cca26b18d3813a637890d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_21.txt @@ -0,0 +1,9 @@ +label_ids.append(label[word_idx]) + else: + label_ids.append(-100) + previous_word_idx = word_idx + labels.append(label_ids) + tokenized_inputs["labels"] = labels + return tokenized_inputs + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [~datasets.Dataset.map] function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_22.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe8737c5f9a447e675066c1c2009deed4c221a20 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_22.txt @@ -0,0 +1,5 @@ +You can speed up the map function by setting batched=True to process multiple elements of the dataset at once: + +tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True) + +Now create a batch of examples using [DataCollatorWithPadding]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_23.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fa2f3dc3281d831e79faf4697f0fba60bd5c93d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_23.txt @@ -0,0 +1 @@ +It's more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_24.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..076a6b4a0aeb023f89f189bb57eae27c25c07823 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_24.txt @@ -0,0 +1,9 @@ +from transformers import DataCollatorForTokenClassification +data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer) + +py +from transformers import DataCollatorForTokenClassification +data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf") + +Evaluate +Including a metric during training is often helpful for evaluating your model's performance. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_25.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb54a12b788c6744523d6aed7df8543dd99480f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_25.txt @@ -0,0 +1 @@ +You can quickly load a evaluation method with the 🤗 Evaluate library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_26.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3228939ebb6dc729b5a9768cab090031c1e3fcef --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_26.txt @@ -0,0 +1 @@ +For this task, load the seqeval framework (see the 🤗 Evaluate quick tour to learn more about how to load and compute a metric). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d78bed871b5aa9d28d553e9afaff0488439e6310 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_27.txt @@ -0,0 +1 @@ +Seqeval actually produces several scores: precision, recall, F1, and accuracy. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..133b90486b6f97cc9bcc4ec5b0b0a10a9724710c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_28.txt @@ -0,0 +1,28 @@ +import evaluate +seqeval = evaluate.load("seqeval") + +Get the NER labels first, and then create a function that passes your true predictions and true labels to [~evaluate.EvaluationModule.compute] to calculate the scores: + +import numpy as np +labels = [label_list[i] for i in example[f"ner_tags"]] +def compute_metrics(p): + predictions, labels = p + predictions = np.argmax(predictions, axis=2) + + true_predictions = [ + [label_list[p] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + true_labels = [ + [label_list[l] for (p, l) in zip(prediction, label) if l != -100] + for prediction, label in zip(predictions, labels) + ] + results = seqeval.compute(predictions=true_predictions, references=true_labels) + return { + "precision": results["overall_precision"], + "recall": results["overall_recall"], + "f1": results["overall_f1"], + "accuracy": results["overall_accuracy"], + } + +Your compute_metrics function is ready to go now, and you'll return to it when you setup your training. 
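If you'd like to smoke-test compute_metrics before training, here is a minimal sketch; the toy logits and label ids are invented for illustration, and it assumes compute_metrics, label_list, and seqeval from the steps above are in scope (class 7 corresponds to B-location in this dataset's label list):

import numpy as np

toy_logits = np.zeros((1, 3, 13))  # one sequence, 3 tokens, 13 classes
toy_logits[0, 0, 0] = 1.0  # argmax -> class 0 ("O")
toy_logits[0, 1, 0] = 1.0  # argmax -> class 0 ("O")
toy_logits[0, 2, 7] = 1.0  # argmax -> class 7 ("B-location")
toy_labels = np.array([[-100, 0, 7]])  # -100 positions are filtered out, just like special tokens
print(compute_metrics((toy_logits, toy_labels)))  # expect perfect scores on this toy input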
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_29.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfcd08472aa33143d4cd8c4387cc418188aaa76a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_29.txt @@ -0,0 +1,35 @@ +Train +Before you start training your model, create a map of the expected ids to their labels with id2label and label2id: + +id2label = { + 0: "O", + 1: "B-corporation", + 2: "I-corporation", + 3: "B-creative-work", + 4: "I-creative-work", + 5: "B-group", + 6: "I-group", + 7: "B-location", + 8: "I-location", + 9: "B-person", + 10: "I-person", + 11: "B-product", + 12: "I-product", + } +label2id = { + "O": 0, + "B-corporation": 1, + "I-corporation": 2, + "B-creative-work": 3, + "I-creative-work": 4, + "B-group": 5, + "I-group": 6, + "B-location": 7, + "I-location": 8, + "B-person": 9, + "I-person": 10, + "B-product": 11, + "I-product": 12, + } + +If you aren't familiar with finetuning a model with the [Trainer], take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_30.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_30.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_31.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fc97665c2d52ea624a807a44fe55ab0de592333 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_31.txt @@ -0,0 +1,10 @@ +Load DistilBERT with [AutoModelForTokenClassification] along with the number of expected labels, and the label mappings: + +from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer +model = AutoModelForTokenClassification.from_pretrained( + "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id + ) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_32.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_32.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_33.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_33.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). 
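If you haven't signed in yet, one way to authenticate is shown below (a sketch assuming the huggingface_hub package, which 🤗 Transformers installs as a dependency, is available):

from huggingface_hub import notebook_login

notebook_login()  # in a notebook; from a terminal, run `huggingface-cli login` instead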
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_34.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa4eea9e8fe1e67d6273319852e7554f608a6f5d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_34.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the seqeval scores and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_35.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..643c2a1221f0298c2d0da3f12550f92d36f0b4a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_35.txt @@ -0,0 +1 @@ +Pass the training arguments to [Trainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_36.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_36.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_37.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b150e72d92ed24888c0da1d49220de91c8e674a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_37.txt @@ -0,0 +1,28 @@ +training_args = TrainingArguments( + output_dir="my_awesome_wnut_model", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + num_train_epochs=2, + weight_decay=0.01, + evaluation_strategy="epoch", + save_strategy="epoch", + load_best_model_at_end=True, + push_to_hub=True, + ) +trainer = Trainer( + model=model, + args=training_args, + train_dataset=tokenized_wnut["train"], + eval_dataset=tokenized_wnut["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_38.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba8919d36f2d75bf648870d30004e0dbc747715c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_38.txt @@ -0,0 +1,36 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import create_optimizer +batch_size = 16 +num_train_epochs = 3 +num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs +optimizer, lr_schedule = create_optimizer( + init_lr=2e-5, + num_train_steps=num_train_steps, + weight_decay_rate=0.01, + num_warmup_steps=0, + ) + +Then you can load DistilBERT with [TFAutoModelForTokenClassification] along with the number of expected labels, and the label mappings: + +from transformers import TFAutoModelForTokenClassification +model = TFAutoModelForTokenClassification.from_pretrained( + "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id + ) + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + tokenized_wnut["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_validation_set = model.prepare_tf_dataset( + tokenized_wnut["validation"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_39.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_39.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_40.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d60467b1a08d01bb4d0e65756bc1474b30a3949 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_40.txt @@ -0,0 +1 @@ +The last two things to setup before you start training is to compute the seqeval scores from the predictions, and provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_41.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69fb6436e3e5a775a94edf0a0ecab9dd3584f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_41.txt @@ -0,0 +1 @@ +Both are done by using Keras callbacks. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_42.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..8faac2abca6709c6082161ec82f9218e6001ac83 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_42.txt @@ -0,0 +1,18 @@ +Pass your compute_metrics function to [~transformers.KerasMetricCallback]: + +from transformers.keras_callbacks import KerasMetricCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) + +Specify where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="my_awesome_wnut_model", + tokenizer=tokenizer, + ) + +Then bundle your callbacks together: + +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_43.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..3694dbd6905b52a14adc7290cc8bb3ae3f8ddfad --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_43.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_44.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a46c95fe0516070b2920d3291731ae72a7745e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_44.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for token classification, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_45.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_45.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_46.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cbeed858569e91a50374cf5aeb02a429f6a76d8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_46.txt @@ -0,0 +1,3 @@ +Grab some text you'd like to run inference on: + +text = "The Golden State Warriors are an American professional basketball team based in San Francisco." 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_47.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_47.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_token_classification/chunk_48.txt b/chunked/content_aware_chunking/tasks_token_classification/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..55d44c351515ecd61e7aaf04193eee85b5444c83 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_token_classification/chunk_48.txt @@ -0,0 +1,108 @@ +Instantiate a pipeline for NER with your model, and pass your text to it: + +from transformers import pipeline +classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model") +classifier(text) +[{'entity': 'B-location', + 'score': 0.42658573, + 'index': 2, + 'word': 'golden', + 'start': 4, + 'end': 10}, + {'entity': 'I-location', + 'score': 0.35856336, + 'index': 3, + 'word': 'state', + 'start': 11, + 'end': 16}, + {'entity': 'B-group', + 'score': 0.3064001, + 'index': 4, + 'word': 'warriors', + 'start': 17, + 'end': 25}, + {'entity': 'B-location', + 'score': 0.65523505, + 'index': 13, + 'word': 'san', + 'start': 80, + 'end': 83}, + {'entity': 'B-location', + 'score': 0.4668663, + 'index': 14, + 'word': 'francisco', + 'start': 84, + 'end': 93}] + +You can also manually replicate the results of the pipeline if you'd like: + +Tokenize the text and return PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +inputs = tokenizer(text, return_tensors="pt") + +Pass your inputs to the model and return the logits: + +from transformers import AutoModelForTokenClassification +model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +with torch.no_grad(): + logits = model(**inputs).logits + +Get the class with the highest probability, and use the model's id2label mapping to convert it to a text label: + +predictions = torch.argmax(logits, dim=2) +predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]] +predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 'B-location', + 'O', + 'O'] + +Tokenize the text and return TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model") +inputs = tokenizer(text, return_tensors="tf") + +Pass your inputs to the model and return the logits: + +from transformers import TFAutoModelForTokenClassification +model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model") +logits = model(**inputs).logits + +Get the class with the highest probability, and use the model's id2label mapping to convert it to a text label: + +predicted_token_class_ids = tf.math.argmax(logits, axis=-1) +predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()] +predicted_token_class +['O', + 'O', + 'B-location', + 'I-location', + 'B-group', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'O', + 'B-location', + 
'B-location', + 'O', + 'O'] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_15.txt b/chunked/content_aware_chunking/tasks_translation/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b606df7180c2e2be472d63ede6d17d92a9485791 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_15.txt @@ -0,0 +1 @@ +You're ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_16.txt b/chunked/content_aware_chunking/tasks_translation/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a03c1ea2f90f0526b4aa3bc2312d235a74e9e44 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_16.txt @@ -0,0 +1,8 @@ +Load T5 with [AutoModelForSeq2SeqLM]: + +from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer +model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) + +At this point, only three steps remain: + +Define your training hyperparameters in [Seq2SeqTrainingArguments]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_17.txt b/chunked/content_aware_chunking/tasks_translation/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be198e14862c0c12a30c53e206e6011dc146c468 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_17.txt @@ -0,0 +1 @@ +The only required parameter is output_dir which specifies where to save your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_18.txt b/chunked/content_aware_chunking/tasks_translation/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac8b43f2f1c246b56f9b8886d6c1ee6f674c1da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_18.txt @@ -0,0 +1 @@ +You'll push this model to the Hub by setting push_to_hub=True (you need to be signed in to Hugging Face to upload your model). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_19.txt b/chunked/content_aware_chunking/tasks_translation/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..092351196fa6d8f7ff4822100c0bc73f88ef1351 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_19.txt @@ -0,0 +1 @@ +At the end of each epoch, the [Trainer] will evaluate the SacreBLEU metric and save the training checkpoint. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_20.txt b/chunked/content_aware_chunking/tasks_translation/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..fcfd7b7e5d8763fbb80800abd33a0c33a328edf9 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_20.txt @@ -0,0 +1 @@ +Pass the training arguments to [Seq2SeqTrainer] along with the model, dataset, tokenizer, data collator, and compute_metrics function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_21.txt b/chunked/content_aware_chunking/tasks_translation/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d16ad4872f10697b15d5b44fdda3cf09e78483e1 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_21.txt @@ -0,0 +1 @@ +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_22.txt b/chunked/content_aware_chunking/tasks_translation/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..305f4ddedb06fdfa4d6fa11ab0bf5683e50e69c2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_22.txt @@ -0,0 +1,29 @@ +training_args = Seq2SeqTrainingArguments( + output_dir="my_awesome_opus_books_model", + evaluation_strategy="epoch", + learning_rate=2e-5, + per_device_train_batch_size=16, + per_device_eval_batch_size=16, + weight_decay=0.01, + save_total_limit=3, + num_train_epochs=2, + predict_with_generate=True, + fp16=True, + push_to_hub=True, + ) +trainer = Seq2SeqTrainer( + model=model, + args=training_args, + train_dataset=tokenized_books["train"], + eval_dataset=tokenized_books["test"], + tokenizer=tokenizer, + data_collator=data_collator, + compute_metrics=compute_metrics, + ) +trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +If you aren't familiar with finetuning a model with Keras, take a look at the basic tutorial here! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_23.txt b/chunked/content_aware_chunking/tasks_translation/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4eb062e132750d846c3b9e6dada0a5d6726cf604 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_23.txt @@ -0,0 +1,26 @@ +To finetune a model in TensorFlow, start by setting up an optimizer function, learning rate schedule, and some training hyperparameters: + +from transformers import AdamWeightDecay +optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01) + +Then you can load T5 with [TFAutoModelForSeq2SeqLM]: + +from transformers import TFAutoModelForSeq2SeqLM +model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint) + +Convert your datasets to the tf.data.Dataset format with [~transformers.TFPreTrainedModel.prepare_tf_dataset]: + +tf_train_set = model.prepare_tf_dataset( + tokenized_books["train"], + shuffle=True, + batch_size=16, + collate_fn=data_collator, + ) +tf_test_set = model.prepare_tf_dataset( + tokenized_books["test"], + shuffle=False, + batch_size=16, + collate_fn=data_collator, + ) + +Configure the model for training with compile. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_24.txt b/chunked/content_aware_chunking/tasks_translation/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cbbc39c9ffbdd5fa9f0d5eeb9a57372b57f83cd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_24.txt @@ -0,0 +1,4 @@ +Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to: + +import tensorflow as tf +model.compile(optimizer=optimizer) # No loss argument! 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_25.txt b/chunked/content_aware_chunking/tasks_translation/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..10c265e65863b2c52f1db5f9b602977feddb9e2e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_25.txt @@ -0,0 +1 @@ +The last two things to setup before you start training is to compute the SacreBLEU metric from the predictions, and provide a way to push your model to the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_26.txt b/chunked/content_aware_chunking/tasks_translation/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..f69fb6436e3e5a775a94edf0a0ecab9dd3584f21 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_26.txt @@ -0,0 +1 @@ +Both are done by using Keras callbacks. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_27.txt b/chunked/content_aware_chunking/tasks_translation/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2f4dca3e2d23825d0c2217bbe34371d8f6d50ec --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_27.txt @@ -0,0 +1,18 @@ +Pass your compute_metrics function to [~transformers.KerasMetricCallback]: + +from transformers.keras_callbacks import KerasMetricCallback +metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set) + +Specify where to push your model and tokenizer in the [~transformers.PushToHubCallback]: + +from transformers.keras_callbacks import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="my_awesome_opus_books_model", + tokenizer=tokenizer, + ) + +Then bundle your callbacks together: + +callbacks = [metric_callback, push_to_hub_callback] + +Finally, you're ready to start training your model! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_28.txt b/chunked/content_aware_chunking/tasks_translation/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..053ce6bb5c418eb39cc4883d00e530f348b28091 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_28.txt @@ -0,0 +1,5 @@ +Call fit with your training and validation datasets, the number of epochs, and your callbacks to finetune the model: + +model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks) + +Once training is completed, your model is automatically uploaded to the Hub so everyone can use it! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_29.txt b/chunked/content_aware_chunking/tasks_translation/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed9f82269abd0d6ff312262e2446f56c2b995a31 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_29.txt @@ -0,0 +1,3 @@ +For a more in-depth example of how to finetune a model for translation, take a look at the corresponding +PyTorch notebook +or TensorFlow notebook. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_30.txt b/chunked/content_aware_chunking/tasks_translation/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec5ec9281024b79527faf7b28b94415e6100aee5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_30.txt @@ -0,0 +1,2 @@ +Inference +Great, now that you've finetuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_31.txt b/chunked/content_aware_chunking/tasks_translation/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b929b5ce10963761e403cba20044aef1e98a7b3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_31.txt @@ -0,0 +1 @@ +Come up with some text you'd like to translate to another language. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_32.txt b/chunked/content_aware_chunking/tasks_translation/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..723726f54e6765014f0e953f098361d2ad54edf6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_32.txt @@ -0,0 +1 @@ +For T5, you need to prefix your input depending on the task you're working on. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_33.txt b/chunked/content_aware_chunking/tasks_translation/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3f4c57a92504627a6cd32c275c13ef51a967174 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_33.txt @@ -0,0 +1,3 @@ +For translation from English to French, you should prefix your input as shown below: + +text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria." \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_34.txt b/chunked/content_aware_chunking/tasks_translation/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c6dead06229ef8e450c4bcfad5f707e61359e29 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_34.txt @@ -0,0 +1 @@ +The simplest way to try out your finetuned model for inference is to use it in a [pipeline]. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_35.txt b/chunked/content_aware_chunking/tasks_translation/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c949e96f093b06399bab51824b3830519ab8479 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_35.txt @@ -0,0 +1,6 @@ +Instantiate a pipeline for translation with your model, and pass your text to it: + +from transformers import pipeline +translator = pipeline("translation", model="my_awesome_opus_books_model") +translator(text) +[{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_36.txt b/chunked/content_aware_chunking/tasks_translation/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..056f1c4e8c1d01bd377bd653bad4f35d50068542 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_36.txt @@ -0,0 +1,9 @@ +You can also manually replicate the results of the pipeline if you'd like: + +Tokenize the text and return the input_ids as PyTorch tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") +inputs = tokenizer(text, return_tensors="pt").input_ids + +Use the [~transformers.generation_utils.GenerationMixin.generate] method to create the translation. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_37.txt b/chunked/content_aware_chunking/tasks_translation/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e7527dc0ea43ab0e9406d5f641704a798a04c7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_37.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text Generation API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_38.txt b/chunked/content_aware_chunking/tasks_translation/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1165c9d9444a7d6d8d91c323b6a19f8172caa3d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_38.txt @@ -0,0 +1,8 @@ +from transformers import AutoModelForSeq2SeqLM +model = AutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") +outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) + +Decode the generated token ids back into text: + +tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lignées partagent des ressources avec des bactéries enfixant l'azote.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_39.txt b/chunked/content_aware_chunking/tasks_translation/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fd187e6b326d0f30f8af5aaba056d23e9004fed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_39.txt @@ -0,0 +1,7 @@ +Tokenize the text and return the input_ids as TensorFlow tensors: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("my_awesome_opus_books_model") +inputs = tokenizer(text, return_tensors="tf").input_ids + +Use the [~transformers.generation_tf_utils.TFGenerationMixin.generate] method to create the translation.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_40.txt b/chunked/content_aware_chunking/tasks_translation/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7e7527dc0ea43ab0e9406d5f641704a798a04c7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_40.txt @@ -0,0 +1 @@ +For more details about the different text generation strategies and parameters for controlling generation, check out the Text Generation API. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_translation/chunk_41.txt b/chunked/content_aware_chunking/tasks_translation/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2576ce729f764e1c8cee119cd724d514766f51a7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_translation/chunk_41.txt @@ -0,0 +1,8 @@ +from transformers import TFAutoModelForSeq2SeqLM +model = TFAutoModelForSeq2SeqLM.from_pretrained("my_awesome_opus_books_model") +outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95) + +Decode the generated token ids back into text: + +tokenizer.decode(outputs[0], skip_special_tokens=True) +'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.' \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_27.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8fcf03866f0c1520f2aabad54ac4a90499903e6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_27.txt @@ -0,0 +1 @@ +You can check out this checkpoint which was obtained by fine-tuning MCG-NJU/videomae-base-finetuned-kinetics. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_28.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..c11a7ebbe0780352a453bba7a017a5dfd5d88dff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_28.txt @@ -0,0 +1,2 @@ +Prepare the datasets for training +For preprocessing the videos, you will leverage the PyTorchVideo library. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_29.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..37d9231c8c00c93da1c3976ad697e302612ca9ba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_29.txt @@ -0,0 +1 @@ +Start by importing the dependencies we need. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_30.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..449ee90a6c78336922b97ab9aee75cb6528a9dd8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_30.txt @@ -0,0 +1,18 @@ +import pytorchvideo.data +from pytorchvideo.transforms import ( + ApplyTransformToKey, + Normalize, + RandomShortSideScale, + RemoveKey, + ShortSideScale, + UniformTemporalSubsample, + ) +from torchvision.transforms import ( + Compose, + Lambda, + RandomCrop, + RandomHorizontalFlip, + Resize, + ) + +For the training dataset transformations, use a combination of uniform temporal subsampling, pixel normalization, random cropping, and random horizontal flipping. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_31.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ada080183227c156abd4fe8fe3c7e4c5d29c705 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_31.txt @@ -0,0 +1 @@ +For the validation and evaluation dataset transformations, keep the same transformation chain except for random cropping and horizontal flipping. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_32.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bac6e34f33913ee1cf717c0257db5156933c9a81 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_32.txt @@ -0,0 +1 @@ +To learn more about the details of these transformations check out the official documentation of PyTorchVideo. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_33.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e5322d5b67339d8dee710e697cee6d03c7e8ae0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_33.txt @@ -0,0 +1,3 @@ +Use the image_processor associated with the pre-trained model to obtain the following information: + +Image mean and standard deviation with which the video frame pixels will be normalized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_34.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd37088f95b7463ad7a93be9ad8d0e7141c510fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_34.txt @@ -0,0 +1 @@ +Spatial resolution to which the video frames will be resized. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_35.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..789268dac02d0b8851b25700ac1b7387805beb0f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_35.txt @@ -0,0 +1 @@ +Start by defining some constants. 
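The constants defined next read from image_processor and model, which the guide creates earlier when loading the checkpoint. As a rough reminder of that step (a sketch under the guide's stated setup: model_ckpt, label2id, and id2label come from the earlier dataset-preparation steps, not from this section):

from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification

model_ckpt = "MCG-NJU/videomae-base"  # checkpoint assumed from earlier in the guide
image_processor = VideoMAEImageProcessor.from_pretrained(model_ckpt)
model = VideoMAEForVideoClassification.from_pretrained(
    model_ckpt,
    label2id=label2id,  # built earlier from the dataset's class names
    id2label=id2label,
    ignore_mismatched_sizes=True,  # re-initialize the classification head for the new labels
)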
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_36.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..7668d0745afa7abc2173c4fb2e8c8360022bbc27 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_36.txt @@ -0,0 +1,14 @@ +mean = image_processor.image_mean +std = image_processor.image_std +if "shortest_edge" in image_processor.size: + height = width = image_processor.size["shortest_edge"] +else: + height = image_processor.size["height"] + width = image_processor.size["width"] +resize_to = (height, width) +num_frames_to_sample = model.config.num_frames +sample_rate = 4 +fps = 30 +clip_duration = num_frames_to_sample * sample_rate / fps + +Now, define the dataset-specific transformations and the datasets respectively. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_37.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9020a3982b55b20994a486bd4f9567207bce174 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_37.txt @@ -0,0 +1,59 @@ +Starting with the training set: + + +train_transform = Compose( + [ + ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(num_frames_to_sample), + Lambda(lambda x: x / 255.0), + Normalize(mean, std), + RandomShortSideScale(min_size=256, max_size=320), + RandomCrop(resize_to), + RandomHorizontalFlip(p=0.5), + ] + ), + ), + ] + ) +train_dataset = pytorchvideo.data.Ucf101( + data_path=os.path.join(dataset_root_path, "train"), + clip_sampler=pytorchvideo.data.make_clip_sampler("random", clip_duration), + decode_audio=False, + transform=train_transform, + ) + +The same sequence of workflow can be applied to the validation and evaluation sets: + + +val_transform = Compose( + [ + ApplyTransformToKey( + key="video", + transform=Compose( + [ + UniformTemporalSubsample(num_frames_to_sample), + Lambda(lambda x: x / 255.0), + Normalize(mean, std), + Resize(resize_to), + ] + ), + ), + ] + ) +val_dataset = pytorchvideo.data.Ucf101( + data_path=os.path.join(dataset_root_path, "val"), + clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), + decode_audio=False, + transform=val_transform, + ) +test_dataset = pytorchvideo.data.Ucf101( + data_path=os.path.join(dataset_root_path, "test"), + clip_sampler=pytorchvideo.data.make_clip_sampler("uniform", clip_duration), + decode_audio=False, + transform=val_transform, + ) + +Note: The above dataset pipelines are taken from the official PyTorchVideo example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_38.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b5929e4c5fa33f214f238800eac3fca400427ca --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_38.txt @@ -0,0 +1 @@ +We're using the pytorchvideo.data.Ucf101() function because it's tailored for the UCF-101 dataset. 
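For reference, the data_path arguments above assume the directory layout prepared earlier in the guide, roughly as sketched below (the class folder names are examples taken from the UCF-101 subset used in this tutorial):

UCF101_subset/
    train/
        BandMarching/
            video_1.avi
            video_2.avi
            ...
        BasketballDunk/
            ...
    val/
        ...
    test/
        ...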
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_39.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..54e7f15a28a300f499ab94af4db79934e7f3116f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_39.txt @@ -0,0 +1 @@ +Under the hood, it returns a pytorchvideo.data.labeled_video_dataset.LabeledVideoDataset object. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_40.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..86ec57905eb573407d1f3ef5648c408e6d21d986 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_40.txt @@ -0,0 +1 @@ +LabeledVideoDataset class is the base class for all things video in the PyTorchVideo dataset. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_41.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cc6a5da620f02e6e5d769a25b9f8d9573a4f2a3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_41.txt @@ -0,0 +1 @@ +So, if you want to use a custom dataset not supported off-the-shelf by PyTorchVideo, you can extend the LabeledVideoDataset class accordingly. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_42.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..c2aa8936e6783d6c53c2fcc593bcad7018ed5b76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_42.txt @@ -0,0 +1 @@ +Refer to the data API documentation to learn more. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_43.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..b29ce2717979ce3fb0d2fcd38946ac919ee5f2ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_43.txt @@ -0,0 +1 @@ +Also, if your dataset follows a similar structure (as shown above), then using the pytorchvideo.data.Ucf101() should work just fine. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_44.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d57ca8c5d1446fd9aee0c0a21ec2d698f09023e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_44.txt @@ -0,0 +1 @@ +You can access the num_videos argument to know the number of videos in the dataset. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_45.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..f671de35737e30af3dbb7b72575ba47d18455805 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_45.txt @@ -0,0 +1,12 @@ +print(train_dataset.num_videos, val_dataset.num_videos, test_dataset.num_videos) + +(300, 30, 75) + +Visualize the preprocessed video for better debugging + + +import imageio +import numpy as np +from IPython.display import Image +def unnormalize_img(img): + """Un-normalizes the image pixels.""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_46.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cb499c2eeb651404bc54fdc52ed3a2fda74e3fd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_46.txt @@ -0,0 +1,5 @@ +img = (img * std) + mean + img = (img * 255).astype("uint8") + return img.clip(0, 255) +def create_gif(video_tensor, filename="sample.gif"): + """Prepares a GIF from a video tensor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_47.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..f10e5d4a17b18ed6a95d8fd1c89a60658eb8ddba --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_47.txt @@ -0,0 +1,3 @@ +The video tensor is expected to have the following shape: + (num_frames, num_channels, height, width). + """ \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_48.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..727b5830f81cf339837fd790f849c2ac2f6d5a37 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_48.txt @@ -0,0 +1,9 @@ +frames = [] + for video_frame in video_tensor: + frame_unnormalized = unnormalize_img(video_frame.permute(1, 2, 0).numpy()) + frames.append(frame_unnormalized) + kargs = {"duration": 0.25} + imageio.mimsave(filename, frames, "GIF", **kargs) + return filename +def display_gif(video_tensor, gif_name="sample.gif"): + """Prepares and displays a GIF from a video tensor.""" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_49.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a34b9044fe1e680253308eb112e3bf23073d456a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_49.txt @@ -0,0 +1,9 @@ +video_tensor = video_tensor.permute(1, 0, 2, 3) + gif_filename = create_gif(video_tensor, gif_name) + return Image(filename=gif_filename) +sample_video = next(iter(train_dataset)) +video_tensor = sample_video["video"] +display_gif(video_tensor) + +Train the model +Leverage Trainer from 🤗 Transformers for training the model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_50.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdc8a2c9d2b67da9b93b78e85a631d4011a510d4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_50.txt @@ -0,0 +1 @@ +To instantiate a Trainer, you need to define the training configuration and an evaluation metric. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_51.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..50bed8ae6d79ec1d770f2a45cf961d5ae6fe3510 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_51.txt @@ -0,0 +1 @@ +The most important is the TrainingArguments, which is a class that contains all the attributes to configure the training. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_52.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7314a30bfd6e190bfbf64c61347ed80774d792e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_52.txt @@ -0,0 +1 @@ +It requires an output folder name, which will be used to save the checkpoints of the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_53.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f208c531150b5b5d2ff3f38a571c1ced479f01c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_53.txt @@ -0,0 +1 @@ +It also helps sync all the information in the model repository on 🤗 Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_54.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..5597e9a476695c92c792bbf6af5d38dd1dfff576 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_54.txt @@ -0,0 +1 @@ +Most of the training arguments are self-explanatory, but one that is quite important here is remove_unused_columns=False. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_55.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cc78bc463b116e059ddb6c423359f19b2eb8238 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_55.txt @@ -0,0 +1 @@ +This one will drop any features not used by the model's call function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_56.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9708726f41576b0d5e6036f3d9adba3f5c38c85 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_56.txt @@ -0,0 +1 @@ +By default it's True because usually it's ideal to drop unused feature columns, making it easier to unpack inputs into the model's call function. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_57.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..14295f0f07f04b2af54ea1809fab9caced73b4e8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_57.txt @@ -0,0 +1 @@ +But, in this case, you need the unused features ('video' in particular) in order to create pixel_values (which is a mandatory key our model expects in its inputs). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_58.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..8977ddf2b50cb545ecfbc482ae39a53bcbfb9809 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_58.txt @@ -0,0 +1,21 @@ +from transformers import TrainingArguments, Trainer +model_name = model_ckpt.split("/")[-1] +new_model_name = f"{model_name}-finetuned-ucf101-subset" +num_epochs = 4 +args = TrainingArguments( + new_model_name, + remove_unused_columns=False, + evaluation_strategy="epoch", + save_strategy="epoch", + learning_rate=5e-5, + per_device_train_batch_size=batch_size, + per_device_eval_batch_size=batch_size, + warmup_ratio=0.1, + logging_steps=10, + load_best_model_at_end=True, + metric_for_best_model="accuracy", + push_to_hub=True, + max_steps=(train_dataset.num_videos // batch_size) * num_epochs, + ) + +The dataset returned by pytorchvideo.data.Ucf101() doesn't implement the __len__ method. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_59.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9956e41dc8defde8947f1bb29254454de9c20d5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_59.txt @@ -0,0 +1 @@ +As such, we must define max_steps when instantiating TrainingArguments. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_60.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..debbb29caf8fb025483e33ae7093cfefbf390b64 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_60.txt @@ -0,0 +1 @@ +Next, you need to define a function to compute the metrics from the predictions, which will use the metric you'll load now. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_61.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..707476886174aa2fc1aa07289acb1e02bedb5089 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_61.txt @@ -0,0 +1,10 @@ +The only preprocessing you have to do is to take the argmax of our predicted logits: + +import evaluate +metric = evaluate.load("accuracy") +def compute_metrics(eval_pred): + predictions = np.argmax(eval_pred.predictions, axis=1) + return metric.compute(predictions=predictions, references=eval_pred.label_ids) + +A note on evaluation: +In the VideoMAE paper, the authors use the following evaluation strategy. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_62.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..47a01e84d64618f60a4a124bcee02c8811129281 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_62.txt @@ -0,0 +1 @@ +They evaluate the model on several clips from test videos and apply different crops to those clips and report the aggregate score. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_63.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..61ba52f085dc31029fd9ba816e4dd2cce589f3ed --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_63.txt @@ -0,0 +1 @@ +However, in the interest of simplicity and brevity, we don't consider that in this tutorial. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_64.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fc394326c109a8539e7e70da13a4e0ad14ec9b6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_64.txt @@ -0,0 +1 @@ +Also, define a collate_fn, which will be used to batch examples together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_65.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e459e4f4da7776d8f43b3d52a6d32d8702ab39e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_65.txt @@ -0,0 +1 @@ +Each batch consists of 2 keys, namely pixel_values and labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_66.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..374d93d2e8ca1b6a9b5cc9084081205cf36d60a8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_66.txt @@ -0,0 +1,22 @@ +def collate_fn(examples): + # permute to (num_frames, num_channels, height, width) + pixel_values = torch.stack( + [example["video"].permute(1, 0, 2, 3) for example in examples] + ) + labels = torch.tensor([example["label"] for example in examples]) + return {"pixel_values": pixel_values, "labels": labels} + +Then you just pass all of this along with the datasets to Trainer: + + +trainer = Trainer( + model, + args, + train_dataset=train_dataset, + eval_dataset=val_dataset, + tokenizer=image_processor, + compute_metrics=compute_metrics, + data_collator=collate_fn, + ) + +You might wonder why you passed along the image_processor as a tokenizer when you preprocessed the data already. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_67.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..392629e25a65eb6ea201821cf68b8773b7787c3d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_67.txt @@ -0,0 +1 @@ +This is only to make sure the image processor configuration file (stored as JSON) will also be uploaded to the repo on the Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_68.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..829d9e28c9bd2088e12dfa7e4fee799098653db3 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_68.txt @@ -0,0 +1,11 @@ +Now fine-tune our model by calling the train method: + + +train_results = trainer.train() + +Once training is completed, share your model to the Hub with the [~transformers.Trainer.push_to_hub] method so everyone can use your model: + +trainer.push_to_hub() + +Inference +Great, now that you have fine-tuned a model, you can use it for inference! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_69.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f86f7fd24e316e5fefa45ff991deb2df43707f2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_69.txt @@ -0,0 +1,6 @@ +Load a video for inference: + + +sample_test_video = next(iter(test_dataset)) + +The simplest way to try out your fine-tuned model for inference is to use it in a pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_70.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4a6f0dae0330ce5cad626be3722f266335cad57 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_70.txt @@ -0,0 +1,12 @@ +Instantiate a pipeline for video classification with your model, and pass your video to it: + +from transformers import pipeline +video_cls = pipeline(model="my_awesome_video_cls_model") +video_cls("https://huggingface.co/datasets/sayakpaul/ucf101-subset/resolve/main/v_BasketballDunk_g14_c06.avi") +[{'score': 0.9272987842559814, 'label': 'BasketballDunk'}, + {'score': 0.017777055501937866, 'label': 'BabyCrawling'}, + {'score': 0.01663011871278286, 'label': 'BalanceBeam'}, + {'score': 0.009560945443809032, 'label': 'BandMarching'}, + {'score': 0.0068979403004050255, 'label': 'BaseballPitch'}] + +You can also manually replicate the results of the pipeline if you'd like. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_71.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..c11467050f3443af82024d35a3e414cfc7de45a6 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_71.txt @@ -0,0 +1,9 @@ +def run_inference(model, video): + # (num_frames, num_channels, height, width) + perumuted_sample_test_video = video.permute(1, 0, 2, 3) + inputs = { + "pixel_values": perumuted_sample_test_video.unsqueeze(0), + "labels": torch.tensor( + [sample_test_video["label"]] + ), # this can be skipped if you don't have labels available. + } \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_video_classification/chunk_72.txt b/chunked/content_aware_chunking/tasks_video_classification/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..60b1a3c6a001d1ef330cc16ebf9177e4ebbd0ccb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_video_classification/chunk_72.txt @@ -0,0 +1,21 @@ +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + inputs = {k: v.to(device) for k, v in inputs.items()} + model = model.to(device) + # forward pass + with torch.no_grad(): + outputs = model(**inputs) + logits = outputs.logits + return logits + +Now, pass your input to the model and return the logits: + +logits = run_inference(trained_model, sample_test_video["video"]) + +Decoding the logits, we get: + + +predicted_class_idx = logits.argmax(-1).item() +print("Predicted class:", model.config.id2label[predicted_class_idx]) + +Predicted class: BasketballDunk +``` \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_26.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..7de0cf6f142d9553b5ab47c137f1630925b37ffc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_26.txt @@ -0,0 +1,15 @@ +Let's load the first 200 examples from the validation split and explore the dataset's features: +thon + +from datasets import load_dataset +dataset = load_dataset("Graphcore/vqa", split="validation[:200]") +dataset +Dataset({ + features: ['question', 'question_type', 'question_id', 'image_id', 'answer_type', 'label'], + num_rows: 200 +}) + +Let's take a look at an example to understand the dataset's features: + +dataset[0] +{'question': 'Where is he looking? 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_27.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d3953bcb604568885129506ce6dc493e69d2385 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_27.txt @@ -0,0 +1,21 @@ +', + 'question_type': 'none of the above', + 'question_id': 262148000, + 'image_id': '/root/.cache/huggingface/datasets/downloads/extracted/ca733e0e000fb2d7a09fbcc94dbfe7b5a30750681d0e965f8e0a23b1c2f98c75/val2014/COCO_val2014_000000262148.jpg', + 'answer_type': 'other', + 'label': {'ids': ['at table', 'down', 'skateboard', 'table'], + 'weights': [0.30000001192092896, + 1.0, + 0.30000001192092896, + 0.30000001192092896]}} + +The features relevant to the task include: +* question: the question to be answered from the image +* image_id: the path to the image the question refers to +* label: the annotations +We can remove the rest of the features as they won't be necessary: + + +dataset = dataset.remove_columns(['question_type', 'question_id', 'answer_type']) + +As you can see, the label feature contains several answers to the same question (called ids here) collected by different human annotators. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_28.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..87c894ec059b7df3ceb2bedb783bb2bf337ab23e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_28.txt @@ -0,0 +1 @@ +This is because the answer to a question can be subjective. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_29.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bbca9f0cf3601c73ddd285ab467d3982b0645fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_29.txt @@ -0,0 +1 @@ +In this case, the question is "where is he looking?". \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_30.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f8136f408ab9db661e023409eeb25715dbcab84 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_30.txt @@ -0,0 +1,2 @@ +Some people +annotated this with "down", others with "at table", another one with "skateboard", etc. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_31.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ed1dfee71c6fa93b43c40f546a52103dd4c9dbc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_31.txt @@ -0,0 +1,9 @@ +Take a look at the image and consider which answer would you give: +thon + +from PIL import Image +image = Image.open(dataset[0]['image_id']) +image + +Due to the questions' and answers' ambiguity, datasets like this are treated as a multi-label classification problem (as +multiple answers are possibly valid). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_32.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..43fed4dd52d9e69ea090ba9816dfadca1bbc3635 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_32.txt @@ -0,0 +1,2 @@ +Moreover, rather than just creating a one-hot encoded vector, one creates a +soft encoding, based on the number of times a certain answer appeared in the annotations. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_33.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..078a858a893701578f17e23cbd421ae6ce409f4c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_33.txt @@ -0,0 +1,2 @@ +For instance, in the example above, because the answer "down" is selected way more often than other answers, it has a +score (called weight in the dataset) of 1.0, and the rest of the answers have scores < 1.0. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_34.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac7d0dddaa1665f5285abe37230fd408f36a4a30 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_34.txt @@ -0,0 +1,11 @@ +To later instantiate the model with an appropriate classification head, let's create two dictionaries: one that maps +the label name to an integer and vice versa: + +import itertools +labels = [item['ids'] for item in dataset['label']] +flattened_labels = list(itertools.chain(*labels)) +unique_labels = list(set(flattened_labels)) +label2id = {label: idx for idx, label in enumerate(unique_labels)} +id2label = {idx: label for label, idx in label2id.items()} + +Now that we have the mappings, we can replace the string answers with their ids, and flatten the dataset for a more convenient further preprocessing. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_35.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..db4bd933b17f17daf40a68d2f738a69adffa8448 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_35.txt @@ -0,0 +1,15 @@ +thon + +def replace_ids(inputs): + inputs["label"]["ids"] = [label2id[x] for x in inputs["label"]["ids"]] + return inputs +dataset = dataset.map(replace_ids) +flat_dataset = dataset.flatten() +flat_dataset.features +{'question': Value(dtype='string', id=None), + 'image_id': Value(dtype='string', id=None), + 'label.ids': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None), + 'label.weights': Sequence(feature=Value(dtype='float64', id=None), length=-1, id=None)} + +Preprocessing data +The next step is to load a ViLT processor to prepare the image and text data for the model. 
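The model_checkpoint referenced in the next snippets is assumed to have been set earlier in the guide; in the original it points to a ViLT checkpoint pre-trained with masked language modeling:

model_checkpoint = "dandelin/vilt-b32-mlm"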
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_36.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89bf0a082ec0453025232eaad3acea2a35e2bdb --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_36.txt @@ -0,0 +1,7 @@ +[ViltProcessor] wraps a BERT tokenizer and ViLT image processor into a convenient single processor: + + +from transformers import ViltProcessor +processor = ViltProcessor.from_pretrained(model_checkpoint) + +To preprocess the data we need to encode the images and questions using the [ViltProcessor]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_37.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fbee28cb587839afaf9d99183fc8c8240b4521a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_37.txt @@ -0,0 +1,2 @@ +The processor will use +the [BertTokenizerFast] to tokenize the text and create input_ids, attention_mask and token_type_ids for the text data. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_38.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..36f1f3257b550075390073e7ebf79cc505bc05df --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_38.txt @@ -0,0 +1 @@ +As for images, the processor will leverage [ViltImageProcessor] to resize and normalize the image, and create pixel_values and pixel_mask. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_39.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8160639ed61170816ce71325c123429953a7f6c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_39.txt @@ -0,0 +1 @@ +All these preprocessing steps are done under the hood, we only need to call the processor. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_40.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3e776c819874ab90866237f3333c6c9e424dfbc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_40.txt @@ -0,0 +1,2 @@ +However, we still need to +prepare the target labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_41.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..07af50799612c597ff8ac75272e782af44054bc5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_41.txt @@ -0,0 +1 @@ +In this representation, each element corresponds to a possible answer (label). 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_42.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d92b485f2def1907331091f65ba7e846d2d0a91 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_42.txt @@ -0,0 +1,2 @@ +For correct answers, the element holds +their respective score (weight), while the remaining elements are set to zero. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_43.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b3f9853b9ec0522178ebf4ba4e46727c6e56a4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_43.txt @@ -0,0 +1,21 @@ +The following function applies the processor to the images and questions and formats the labels as described above: + +import torch +def preprocess_data(examples): + image_paths = examples['image_id'] + images = [Image.open(image_path) for image_path in image_paths] + texts = examples['question'] + + encoding = processor(images, texts, padding="max_length", truncation=True, return_tensors="pt") + for k, v in encoding.items(): + encoding[k] = v.squeeze() + targets = [] + for labels, scores in zip(examples['label.ids'], examples['label.weights']): + target = torch.zeros(len(id2label)) + for label, score in zip(labels, scores): + target[label] = score + targets.append(target) + encoding["labels"] = targets + return encoding + +To apply the preprocessing function over the entire dataset, use 🤗 Datasets [~datasets.map] function. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_44.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..e00e14dd34732b11818deb351bd3ea4b0ac8a60b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_44.txt @@ -0,0 +1,2 @@ +You can speed up map by +setting batched=True to process multiple elements of the dataset at once. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_45.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..77469a87be687e5df67bd13276dcb1e28db14d4d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_45.txt @@ -0,0 +1 @@ +At this point, feel free to remove the columns you don't need. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_46.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..29381a53b7c1df7439f273f2fd24d5bc7e267c33 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_46.txt @@ -0,0 +1,14 @@ +processed_dataset = flat_dataset.map(preprocess_data, batched=True, remove_columns=['question','question_type', 'question_id', 'image_id', 'answer_type', 'label.ids', 'label.weights']) +processed_dataset +Dataset({ + features: ['input_ids', 'token_type_ids', 'attention_mask', 'pixel_values', 'pixel_mask', 'labels'], + num_rows: 200 +}) + +As a final step, create a batch of examples using [DefaultDataCollator]: + +from transformers import DefaultDataCollator +data_collator = DefaultDataCollator() + +Train the model +You’re ready to start training your model now! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_47.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..d14de638cc4b57a06edbf1136b71e3de02ec3386 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_47.txt @@ -0,0 +1 @@ +Load ViLT with [ViltForQuestionAnswering]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_48.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a6c921139123a24c601cb311ece84af79422bc2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_48.txt @@ -0,0 +1,25 @@ +Specify the number of labels +along with the label mappings: + +from transformers import ViltForQuestionAnswering +model = ViltForQuestionAnswering.from_pretrained(model_checkpoint, num_labels=len(id2label), id2label=id2label, label2id=label2id) + +At this point, only three steps remain: + +Define your training hyperparameters in [TrainingArguments]: + +from transformers import TrainingArguments +repo_id = "MariaK/vilt_finetuned_200" +training_args = TrainingArguments( + output_dir=repo_id, + per_device_train_batch_size=4, + num_train_epochs=20, + save_steps=200, + logging_steps=50, + learning_rate=5e-5, + save_total_limit=2, + remove_unused_columns=False, + push_to_hub=True, + ) + +Pass the training arguments to [Trainer] along with the model, dataset, processor, and data collator. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_49.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dc5fe128ac3fb35c293c1833b4c4c059afe9ce5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_49.txt @@ -0,0 +1,10 @@ +from transformers import Trainer +trainer = Trainer( + model=model, + args=training_args, + data_collator=data_collator, + train_dataset=processed_dataset, + tokenizer=processor, + ) + +Call [~Trainer.train] to finetune your model. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_50.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ae7dc4342fce8ebdb3378573d5f86e1ec26d5c4 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_50.txt @@ -0,0 +1,8 @@ +trainer.train() + +Once training is completed, share your final model on the 🤗 Hub with the [~Trainer.push_to_hub] method: + +trainer.push_to_hub() + +Inference +Now that you have fine-tuned a ViLT model and uploaded it to the 🤗 Hub, you can use it for inference. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_51.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..0da9d6025eca356799c2cb6698c647257fb6396b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_51.txt @@ -0,0 +1,2 @@ +The simplest +way to try out your fine-tuned model for inference is to use it in a [Pipeline]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_52.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..3289122a25888fe5dd6a581d8e9232a9a6bf82ca --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_52.txt @@ -0,0 +1,4 @@ +from transformers import pipeline +pipe = pipeline("visual-question-answering", model="MariaK/vilt_finetuned_200") + +The model in this guide has only been trained on 200 examples, so don't expect a lot from it. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_53.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d63b5856acd96dd8693fd67008ef3c1ed35d27e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_53.txt @@ -0,0 +1,9 @@ +Let's see if it at least +learned something from the data and take the first example from the dataset to illustrate inference: + +example = dataset[0] +image = Image.open(example['image_id']) +question = example['question'] +print(question) +pipe(image, question, top_k=1) +"Where is he looking?" \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_54.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..230257ad0dc8e42f8280df722d99237e952d1a2c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_54.txt @@ -0,0 +1,3 @@ +[{'score': 0.5498199462890625, 'answer': 'down'}] + +Even though not very confident, the model indeed has learned something.
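If you'd like to see how the probability mass is spread over other candidate answers before drawing conclusions, you can ask the pipeline for more than one answer; top_k is the same argument used in the call above, and 5 is only an illustrative value:

# Illustrative only: returns the five highest-scoring answers together with their scores.
pipe(image, question, top_k=5)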
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_55.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccdc396b300c8d6539f02bc1d2dd59a05e3a848c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_55.txt @@ -0,0 +1 @@ +With more examples and longer training, you'll get far better results! \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_56.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..29381a53b7c1df7439f273f2fd24d5bc7e267c33 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_56.txt @@ -0,0 +1,2 @@ +You can also manually replicate the results of the pipeline if you'd like: +1. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_57.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c51f9219a2f8a10c35c23572649f04bd47b3e1b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_57.txt @@ -0,0 +1 @@ +Take an image and a question, prepare them for the model using the processor from your model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_58.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_58.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_59.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..5608f0e93336431485ef9ded2fe660a0946c862f --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_59.txt @@ -0,0 +1 @@ +Forward the result of preprocessing through the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_60.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_60.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_61.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..9aa9f87ea8ccdeb78115d0e85f21dda04b75014b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_61.txt @@ -0,0 +1 @@ +From the logits, get the most likely answer's id, and find the actual answer in the id2label.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_62.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1867bf1aee5094a958dac521d6e08bfedb9476d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_62.txt @@ -0,0 +1,16 @@ +processor = ViltProcessor.from_pretrained("MariaK/vilt_finetuned_200") +image = Image.open(example['image_id']) +question = example['question'] +# prepare inputs +inputs = processor(image, question, return_tensors="pt") +model = ViltForQuestionAnswering.from_pretrained("MariaK/vilt_finetuned_200") +# forward pass +with torch.no_grad(): + outputs = model(**inputs) +logits = outputs.logits +idx = logits.argmax(-1).item() +print("Predicted answer:", model.config.id2label[idx]) +Predicted answer: down + +Zero-shot VQA +The previous model treated VQA as a classification task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_63.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb50be7842fde49a1e1e10d8c8ae7e1c33ac6a6d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_63.txt @@ -0,0 +1,2 @@ +Some recent models, such as BLIP, BLIP-2, and InstructBLIP, approach +VQA as a generative task. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_64.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..3adb5731943d76bb1af76791c354004718bbf6fc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_64.txt @@ -0,0 +1 @@ +Let's take BLIP-2 as an example. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_65.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c64b31ac9abb2ab765b8a8d5a96e3bc54e693ce --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_65.txt @@ -0,0 +1,2 @@ +It introduced a new visual-language pre-training +paradigm in which any combination of pre-trained vision encoder and LLM can be used (learn more in the BLIP-2 blog post). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_66.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca29fc545269eac2166d730b49011028914cf29c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_66.txt @@ -0,0 +1 @@ +This enables achieving state-of-the-art results on multiple visual-language tasks including visual question answering. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_67.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdf23020df45173c0aca4e245a4e2f5674e942b2 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_67.txt @@ -0,0 +1 @@ +Let's illustrate how you can use this model for VQA.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_68.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..a079f0c96f806ab910cf90e4b12a7234269b01ac --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_68.txt @@ -0,0 +1 @@ +First, let's load the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_69.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..88b2a43e717f0faecd580ad38e5634e9969a09f5 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_69.txt @@ -0,0 +1,18 @@ +Here we'll explicitly send the model to a +GPU, if available, which we didn't need to do earlier when training, as [Trainer] handles this automatically: + +from transformers import AutoProcessor, Blip2ForConditionalGeneration +import torch +processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") +model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) +device = "cuda" if torch.cuda.is_available() else "cpu" +model.to(device) + +The model takes an image and text as input, so let's use the exact same image/question pair from the first example in the VQA dataset: + + +example = dataset[0] +image = Image.open(example['image_id']) +question = example['question'] + +To use BLIP-2 for the visual question answering task, the textual prompt has to follow a specific format: Question: {} Answer:. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_70.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..27e41585d4b81ed8c5eb8ea9453a1f84216ede6c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_70.txt @@ -0,0 +1,12 @@ +prompt = f"Question: {question} Answer:" + +Now we need to preprocess the image/prompt with the model's processor, pass the processed input through the model, and decode the output: + +inputs = processor(image, text=prompt, return_tensors="pt").to(device, torch.float16) +generated_ids = model.generate(**inputs, max_new_tokens=10) +generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() +print(generated_text) +"He is looking at the crowd" + +As you can see, the model recognized the crowd and the direction of the face (looking down); however, it seems to miss +the fact that the crowd is behind the skater. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_71.txt b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b74a496e88665845c37b05fc63d9953868ec9e0 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_visual_question_answering/chunk_71.txt @@ -0,0 +1,2 @@ +Still, in cases where acquiring human-annotated datasets is not feasible, this +approach can quickly produce useful results.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_10.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7277690a65b5ebcbce4b8ac777f5efdcc559b7e --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_10.txt @@ -0,0 +1 @@ +The candidate labels can be simple words like in this example, or more descriptive. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_11.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f90e2458654d03e25ffc5111f783fac38481b96 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_11.txt @@ -0,0 +1,10 @@ +predictions = detector(image, candidate_labels=["fox", "bear", "seagull", "owl"]) +predictions +[{'score': 0.9996670484542847, 'label': 'owl'}, + {'score': 0.000199399160919711, 'label': 'seagull'}, + {'score': 7.392891711788252e-05, 'label': 'fox'}, + {'score': 5.96074532950297e-05, 'label': 'bear'}] + +Zero-shot image classification by hand +Now that you've seen how to use the zero-shot image classification pipeline, let's take a look at how you can run zero-shot +image classification manually. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_12.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d101059fad2e8f365be3ab979e2ddcd6db418ff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_12.txt @@ -0,0 +1 @@ +Start by loading the model and associated processor from a checkpoint on the Hugging Face Hub. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_13.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..272570441ce79cf8e8ff79c390e76babde984376 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_13.txt @@ -0,0 +1,7 @@ +Here we'll use the same checkpoint as before: + +from transformers import AutoProcessor, AutoModelForZeroShotImageClassification +model = AutoModelForZeroShotImageClassification.from_pretrained(checkpoint) +processor = AutoProcessor.from_pretrained(checkpoint) + +Let's take a different image to switch things up. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_14.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..35c8325a453aaf7c102feb21838c0e921b1a985a --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_14.txt @@ -0,0 +1,7 @@ +from PIL import Image +import requests +url = "https://unsplash.com/photos/xBRQfR2bqNI/download?ixid=MnwxMjA3fDB8MXxhbGx8fHx8fHx8fHwxNjc4Mzg4ODEx&force=true&w=640" +image = Image.open(requests.get(url, stream=True).raw) +image + +Use the processor to prepare the inputs for the model.
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_15.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..16dc3a71180f9bb100d916aa4928d785ca360862 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_15.txt @@ -0,0 +1,2 @@ +The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a tokenizer that takes care of the text inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_16.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..53a8911e816a6740d9358c136fc9b51c51cfccdd --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_16.txt @@ -0,0 +1,20 @@ +candidate_labels = ["tree", "car", "bike", "cat"] +inputs = processor(images=image, text=candidate_labels, return_tensors="pt", padding=True) + +Pass the inputs through the model, and post-process the results: + +import torch +with torch.no_grad(): + outputs = model(**inputs) +logits = outputs.logits_per_image[0] +probs = logits.softmax(dim=-1).numpy() +scores = probs.tolist() +result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(probs, candidate_labels), key=lambda x: -x[0]) + ] +result +[{'score': 0.998572, 'label': 'car'}, + {'score': 0.0010570387, 'label': 'bike'}, + {'score': 0.0003393686, 'label': 'tree'}, + {'score': 3.1572064e-05, 'label': 'cat'}] \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_5.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbb29c3455d8121d84009caa703952b3d1594295 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_5.txt @@ -0,0 +1,2 @@ +This is a more flexible approach to image classification that allows models to generalize to new and unseen categories +without the need for additional training data and enables users to query images with free-form text descriptions of their target objects. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_6.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..752c6efb0a6436925ac48ff29019c44a73a89502 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_6.txt @@ -0,0 +1,10 @@ +In this guide you'll learn how to: + +create a zero-shot image classification pipeline +run zero-shot image classification inference by hand + +Before you begin, make sure you have all the necessary libraries installed: + +pip install -q transformers +Zero-shot image classification pipeline +The simplest way to try out inference with a model supporting zero-shot image classification is to use the corresponding [pipeline].
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_7.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f772ab58d788583c67bdc17cf2a6034da88ed549 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_7.txt @@ -0,0 +1,7 @@ +Instantiate a pipeline from a checkpoint on the Hugging Face Hub: + +from transformers import pipeline +checkpoint = "openai/clip-vit-large-patch14" +detector = pipeline(model=checkpoint, task="zero-shot-image-classification") + +Next, choose an image you'd like to classify. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_8.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fe4a168125e91efcd34ed45c49ee15c8610fd22 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_8.txt @@ -0,0 +1,7 @@ +from PIL import Image +import requests +url = "https://unsplash.com/photos/g8oS8-82DxI/download?ixid=MnwxMjA3fDB8MXx0b3BpY3x8SnBnNktpZGwtSGt8fHx8fDJ8fDE2NzgxMDYwODc&force=true&w=640" +image = Image.open(requests.get(url, stream=True).raw) +image + +Pass the image and the candidate object labels to the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_9.txt b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c862411665fef4fc2c3ea19706e6f92fea3ade --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_image_classification/chunk_9.txt @@ -0,0 +1,2 @@ +Here we pass the image directly; other suitable options +include a local path to an image or an image url. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_13.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f74109be14105fdb5cee9c6858c7f014aa82b99b --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_13.txt @@ -0,0 +1,8 @@ +import skimage +import numpy as np +from PIL import Image +image = skimage.data.astronaut() +image = Image.fromarray(np.uint8(image)).convert("RGB") +image + +Pass the image and the candidate object labels to look for to the pipeline. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_14.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7e257c6fb0b2ad9ebfc8063e5769e5f4d615a31 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_14.txt @@ -0,0 +1 @@ +Here we pass the image directly; other suitable options include a local path to an image or an image url.
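As a quick, hedged illustration of those alternatives, the same pipeline call also accepts a URL string or a local file path in place of a PIL image; the URL, path, and labels below are placeholders rather than part of this guide:

# Sketch only: substitute a real URL or an existing local file, and your own candidate labels.
predictions = detector("https://example.com/some_image.jpg", candidate_labels=["cat", "dog"])
predictions = detector("path/to/local/image.png", candidate_labels=["cat", "dog"])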
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_15.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e166fb6929f0db3a390ad380dacbbd590b2d660 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_15.txt @@ -0,0 +1 @@ +We also pass text descriptions for all items we want to query the image for. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_16.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5eaa2e3d0201bd13f60ebfcd26a5ddd45600319 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_16.txt @@ -0,0 +1,42 @@ +predictions = detector( + image, + candidate_labels=["human face", "rocket", "nasa badge", "star-spangled banner"], + ) +predictions +[{'score': 0.3571370542049408, + 'label': 'human face', + 'box': {'xmin': 180, 'ymin': 71, 'xmax': 271, 'ymax': 178}}, + {'score': 0.28099656105041504, + 'label': 'nasa badge', + 'box': {'xmin': 129, 'ymin': 348, 'xmax': 206, 'ymax': 427}}, + {'score': 0.2110239565372467, + 'label': 'rocket', + 'box': {'xmin': 350, 'ymin': -1, 'xmax': 468, 'ymax': 288}}, + {'score': 0.13790413737297058, + 'label': 'star-spangled banner', + 'box': {'xmin': 1, 'ymin': 1, 'xmax': 105, 'ymax': 509}}, + {'score': 0.11950037628412247, + 'label': 'nasa badge', + 'box': {'xmin': 277, 'ymin': 338, 'xmax': 327, 'ymax': 380}}, + {'score': 0.10649408400058746, + 'label': 'rocket', + 'box': {'xmin': 358, 'ymin': 64, 'xmax': 424, 'ymax': 280}}] + +Let's visualize the predictions: + +from PIL import ImageDraw +draw = ImageDraw.Draw(image) +for prediction in predictions: + box = prediction["box"] + label = prediction["label"] + score = prediction["score"] + + xmin, ymin, xmax, ymax = box.values() + draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) + draw.text((xmin, ymin), f"{label}: {round(score,2)}", fill="white") + +image + +Text-prompted zero-shot object detection by hand +Now that you've seen how to use the zero-shot object detection pipeline, let's replicate the same +result manually. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_17.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d101059fad2e8f365be3ab979e2ddcd6db418ff --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_17.txt @@ -0,0 +1 @@ +Start by loading the model and associated processor from a checkpoint on the Hugging Face Hub. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_18.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a24fc42f142c66afe2b6652cfd7776a451a8b27 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_18.txt @@ -0,0 +1,7 @@ +Here we'll use the same checkpoint as before: + +from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection +model = AutoModelForZeroShotObjectDetection.from_pretrained(checkpoint) +processor = AutoProcessor.from_pretrained(checkpoint) + +Let's take a different image to switch things up. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_19.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..477ed5a4fbdd23455d9c1abb8f1a82aa2460dadc --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_19.txt @@ -0,0 +1,6 @@ +import requests +url = "https://unsplash.com/photos/oj0zeY2Ltk4/download?ixid=MnwxMjA3fDB8MXxzZWFyY2h8MTR8fHBpY25pY3xlbnwwfHx8fDE2Nzc0OTE1NDk&force=true&w=640" +im = Image.open(requests.get(url, stream=True).raw) +im + +Use the processor to prepare the inputs for the model. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_20.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d455ec8d40e5ed097ac6e91a96cff0236ee92f8 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_20.txt @@ -0,0 +1,2 @@ +The processor combines an image processor that prepares the +image for the model by resizing and normalizing it, and a [CLIPTokenizer] that takes care of the text inputs. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_21.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..28f4e66425cac37a2f009a101f4f491522d5c055 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_21.txt @@ -0,0 +1,4 @@ +text_queries = ["hat", "book", "sunglasses", "camera"] +inputs = processor(text=text_queries, images=im, return_tensors="pt") + +Pass the inputs through the model, post-process, and visualize the results. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_22.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..005d04171588dab0fd84103d0e7d07351bce78da --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_22.txt @@ -0,0 +1,21 @@ +Since the image processor resized images before +feeding them to the model, you need to use the [~OwlViTImageProcessor.post_process_object_detection] method to make sure the predicted bounding +boxes have the correct coordinates relative to the original image: + +import torch +with torch.no_grad(): + outputs = model(**inputs) + target_sizes = torch.tensor([im.size[::-1]]) + results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes)[0] +draw = ImageDraw.Draw(im) +scores = results["scores"].tolist() +labels = results["labels"].tolist() +boxes = results["boxes"].tolist() +for box, score, label in zip(boxes, scores, labels): + xmin, ymin, xmax, ymax = box + draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) + draw.text((xmin, ymin), f"{text_queries[label]}: {round(score,2)}", fill="white") +im + +Batch processing +You can pass multiple sets of images and text queries to search for different (or same) objects in several images. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_23.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf398af8d56205e3b28d1c94cd6b86e0f3f4dd19 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_23.txt @@ -0,0 +1 @@ +Let's use both an astronaut image and the beach image together. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_24.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ac052d028dd58a05e825e66a97f7d55a5877c43 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_24.txt @@ -0,0 +1,2 @@ +For batch processing, you should pass text queries as a nested list to the processor and images as lists of PIL images, +PyTorch tensors, or NumPy arrays. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_25.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..d578555a97e39be7ad6f15f894bc0fc5a1d98648 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_25.txt @@ -0,0 +1,9 @@ +images = [image, im] +text_queries = [ + ["human face", "rocket", "nasa badge", "star-spangled banner"], + ["hat", "book", "sunglasses", "camera"], + ] +inputs = processor(text=text_queries, images=images, return_tensors="pt") + +Previously for post-processing you passed the single image's size as a tensor, but you can also pass a tuple, or, in case +of several images, a list of tuples. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_26.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d96466288f54b718093c0f3317a0c20c92c5a78 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_26.txt @@ -0,0 +1 @@ +Let's create predictions for the two examples, and visualize the second one (image_idx = 1). \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_27.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..580f4f9b2785822009ae4fba0be8b23917c11e4c --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_27.txt @@ -0,0 +1,17 @@ +with torch.no_grad(): + outputs = model(**inputs) + target_sizes = [x.size[::-1] for x in images] + results = processor.post_process_object_detection(outputs, threshold=0.1, target_sizes=target_sizes) +image_idx = 1 +draw = ImageDraw.Draw(images[image_idx]) +scores = results[image_idx]["scores"].tolist() +labels = results[image_idx]["labels"].tolist() +boxes = results[image_idx]["boxes"].tolist() +for box, score, label in zip(boxes, scores, labels): + xmin, ymin, xmax, ymax = box + draw.rectangle((xmin, ymin, xmax, ymax), outline="red", width=1) + draw.text((xmin, ymin), f"{text_queries[image_idx][label]}: {round(score,2)}", fill="white") +images[image_idx] + +Image-guided object detection +In addition to zero-shot object detection with text queries, OWL-ViT offers image-guided object detection. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_28.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8ce50ac2859087f5cb0fca11dff065eeb87de08 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_28.txt @@ -0,0 +1,2 @@ +This means +you can use an image query to find similar objects in the target image. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_29.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..88a8433f19d1e4a9353b5387e9cb5c84f154c0e7 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_29.txt @@ -0,0 +1 @@ +Unlike text queries, only a single example image is allowed. 
\ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_30.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e4fcd7d4da807037caa8cb4bb1813a432feb58d --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_30.txt @@ -0,0 +1,20 @@ +Let's take an image with two cats on a couch as a target image, and an image of a single cat +as a query: + +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_target = Image.open(requests.get(url, stream=True).raw) +query_url = "http://images.cocodataset.org/val2017/000000524280.jpg" +query_image = Image.open(requests.get(query_url, stream=True).raw) + +Let's take a quick look at the images: + +import matplotlib.pyplot as plt +fig, ax = plt.subplots(1, 2) +ax[0].imshow(image_target) +ax[1].imshow(query_image) + +In the preprocessing step, instead of text queries, you now need to use query_images: + +inputs = processor(images=image_target, query_images=query_image, return_tensors="pt") + +For predictions, instead of passing the inputs to the model, pass them to [~OwlViTForObjectDetection.image_guided_detection]. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_31.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbea6db2714e3a26e9f39467f79989ce63808b80 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_31.txt @@ -0,0 +1,2 @@ +Draw the predictions +as before except now there are no labels. \ No newline at end of file diff --git a/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_32.txt b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5b68cd2baf105a37d15035da7d478fc7c749302 --- /dev/null +++ b/chunked/content_aware_chunking/tasks_zero_shot_object_detection/chunk_32.txt @@ -0,0 +1,11 @@ +with torch.no_grad(): + outputs = model.image_guided_detection(**inputs) + target_sizes = torch.tensor([image_target.size[::-1]]) + results = processor.post_process_image_guided_detection(outputs=outputs, target_sizes=target_sizes)[0] +draw = ImageDraw.Draw(image_target) +scores = results["scores"].tolist() +boxes = results["boxes"].tolist() +for box, score in zip(boxes, scores): + xmin, ymin, xmax, ymax = box + draw.rectangle((xmin, ymin, xmax, ymax), outline="white", width=4) +image_target \ No newline at end of file diff --git a/chunked/nltk_chunking/__config/chunk_0.txt b/chunked/nltk_chunking/__config/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..c70d06718919dcf7feea58eafd40130774f52088 --- /dev/null +++ b/chunked/nltk_chunking/__config/chunk_0.txt @@ -0,0 +1,4 @@ +docstyle-ignore +INSTALL_CONTENT = """ +Transformers installation +! \ No newline at end of file diff --git a/chunked/nltk_chunking/__config/chunk_1.txt b/chunked/nltk_chunking/__config/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..570746a8d53645c1a5f75632488aa36c318f718f --- /dev/null +++ b/chunked/nltk_chunking/__config/chunk_1.txt @@ -0,0 +1,2 @@ +pip install transformers datasets +To install from source instead of the last release, comment the command above and uncomment the following one.
\ No newline at end of file diff --git a/chunked/nltk_chunking/__config/chunk_2.txt b/chunked/nltk_chunking/__config/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e0f12e3246e5d0b556558359a30e0991092cdc --- /dev/null +++ b/chunked/nltk_chunking/__config/chunk_2.txt @@ -0,0 +1 @@ +! \ No newline at end of file diff --git a/chunked/nltk_chunking/__config/chunk_3.txt b/chunked/nltk_chunking/__config/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1c2b553afc68a2cfc0a086df93969ea53775ed0 --- /dev/null +++ b/chunked/nltk_chunking/__config/chunk_3.txt @@ -0,0 +1,8 @@ +pip install git+https://github.com/huggingface/transformers.git +""" +notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}] +black_avoid_patterns = { + "{processor_class}": "FakeProcessorClass", + "{model_class}": "FakeModelClass", + "{object_class}": "FakeObjectClass", +} \ No newline at end of file diff --git a/chunked/nltk_chunking/__redirects/chunk_0.txt b/chunked/nltk_chunking/__redirects/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..c796d74659bbb4d3879037d20de06d01402b184c --- /dev/null +++ b/chunked/nltk_chunking/__redirects/chunk_0.txt @@ -0,0 +1,2 @@ +Optimizing inference +perf_infer_gpu_many: perf_infer_gpu_one \ No newline at end of file diff --git a/chunked/nltk_chunking/__toctree/chunk_0.txt b/chunked/nltk_chunking/__toctree/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a88c156ea798dfc7bab6c01412469153b203f71 --- /dev/null +++ b/chunked/nltk_chunking/__toctree/chunk_0.txt @@ -0,0 +1,192 @@ + +sections: +local: index + title: 🤗 Transformers +local: quicktour + title: Quick tour +local: installation + title: Installation + title: Get started +sections: +local: pipeline_tutorial + title: Run inference with pipelines +local: autoclass_tutorial + title: Write portable code with AutoClass +local: preprocessing + title: Preprocess data +local: training + title: Fine-tune a pretrained model +local: run_scripts + title: Train with a script +local: accelerate + title: Set up distributed training with 🤗 Accelerate +local: peft + title: Load and train adapters with 🤗 PEFT +local: model_sharing + title: Share your model +local: transformers_agents + title: Agents +local: llm_tutorial + title: Generation with LLMs + title: Tutorials +sections: +isExpanded: false + sections: +local: tasks/sequence_classification + title: Text classification +local: tasks/token_classification + title: Token classification +local: tasks/question_answering + title: Question answering +local: tasks/language_modeling + title: Causal language modeling +local: tasks/masked_language_modeling + title: Masked language modeling +local: tasks/translation + title: Translation +local: tasks/summarization + title: Summarization +local: tasks/multiple_choice + title: Multiple choice +title: Natural Language Processing + +isExpanded: false + sections: +local: tasks/audio_classification + title: Audio classification +local: tasks/asr + title: Automatic speech recognition +title: Audio + +isExpanded: false + sections: +local: tasks/image_classification + title: Image classification +local: tasks/semantic_segmentation + title: Image segmentation +local: tasks/video_classification + title: Video classification +local: tasks/object_detection + title: Object detection +local: tasks/zero_shot_object_detection + title: Zero-shot object detection +local: tasks/zero_shot_image_classification + title: Zero-shot image 
classification +local: tasks/monocular_depth_estimation + title: Depth estimation +local: tasks/image_to_image + title: Image-to-Image +local: tasks/mask_generation + title: Mask Generation +local: tasks/knowledge_distillation_for_image_classification + title: Knowledge Distillation for Computer Vision +title: Computer Vision + +isExpanded: false + sections: +local: tasks/image_captioning + title: Image captioning +local: tasks/document_question_answering + title: Document Question Answering +local: tasks/visual_question_answering + title: Visual Question Answering +local: tasks/text-to-speech + title: Text to speech +title: Multimodal + +isExpanded: false + sections: +local: generation_strategies + title: Customize the generation strategy +title: Generation + +isExpanded: false + sections: +local: tasks/idefics + title: Image tasks with IDEFICS +local: tasks/prompting + title: LLM prompting guide +title: Prompting + title: Task Guides + +sections: +local: fast_tokenizers + title: Use fast tokenizers from 🤗 Tokenizers +local: multilingual + title: Run inference with multilingual models +local: create_a_model + title: Use model-specific APIs +local: custom_models + title: Share a custom model +local: chat_templating + title: Templates for chat models +local: trainer + title: Trainer +local: sagemaker + title: Run training on Amazon SageMaker +local: serialization + title: Export to ONNX +local: tflite + title: Export to TFLite +local: torchscript + title: Export to TorchScript +local: benchmarks + title: Benchmarks +local: notebooks + title: Notebooks with examples +local: community + title: Community resources +local: custom_tools + title: Custom Tools and Prompts +local: troubleshooting + title: Troubleshoot +local: hf_quantizer + title: Contribute new quantization method + title: Developer guides +sections: +local: performance + title: Overview +local: quantization + title: Quantization +sections: +local: perf_train_gpu_one + title: Methods and tools for efficient training on a single GPU +local: perf_train_gpu_many + title: Multiple GPUs and parallelism +local: fsdp + title: Fully Sharded Data Parallel +local: deepspeed + title: DeepSpeed +local: perf_train_cpu + title: Efficient training on CPU +local: perf_train_cpu_many + title: Distributed CPU training +local: perf_train_tpu_tf + title: Training on TPU with TensorFlow +local: perf_train_special + title: PyTorch training on Apple silicon +local: perf_hardware + title: Custom hardware for training +local: hpo_train + title: Hyperparameter Search using Trainer API +title: Efficient training techniques + +sections: +local: perf_infer_cpu + title: CPU inference +local: perf_infer_gpu_one + title: GPU inference +title: Optimizing inference + +local: big_models + title: Instantiating a big model +local: debugging + title: Debugging +local: tf_xla + title: XLA Integration for TensorFlow Models +local: perf_torch_compile + title: Optimize inference using torch.compile() + title: Performance and scalability +sections: +local: contributing + title: How to contribute to 🤗 Transformers? \ No newline at end of file diff --git a/chunked/nltk_chunking/__toctree/chunk_1.txt b/chunked/nltk_chunking/__toctree/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d087208302a6ac7277cba1aef8ef1af9b14aa22 --- /dev/null +++ b/chunked/nltk_chunking/__toctree/chunk_1.txt @@ -0,0 +1,2 @@ +local: add_new_model + title: How to add a model to 🤗 Transformers? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/__toctree/chunk_2.txt b/chunked/nltk_chunking/__toctree/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..76209eea7470e02152e059347698b4328102f5ab --- /dev/null +++ b/chunked/nltk_chunking/__toctree/chunk_2.txt @@ -0,0 +1,2 @@ +local: add_tensorflow_model + title: How to convert a 🤗 Transformers model to TensorFlow? \ No newline at end of file diff --git a/chunked/nltk_chunking/__toctree/chunk_3.txt b/chunked/nltk_chunking/__toctree/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ead8ca274b89f5fff27f905d7eb8d7c05c7b9aa --- /dev/null +++ b/chunked/nltk_chunking/__toctree/chunk_3.txt @@ -0,0 +1,2 @@ +local: add_new_pipeline + title: How to add a pipeline to 🤗 Transformers? \ No newline at end of file diff --git a/chunked/nltk_chunking/__toctree/chunk_4.txt b/chunked/nltk_chunking/__toctree/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..7daf7c1d01b3104518e9a1ee6869ea21cf35d7d5 --- /dev/null +++ b/chunked/nltk_chunking/__toctree/chunk_4.txt @@ -0,0 +1,637 @@ +local: testing + title: Testing +local: pr_checks + title: Checks on a Pull Request + title: Contribute +sections: +local: philosophy + title: Philosophy +local: glossary + title: Glossary +local: task_summary + title: What 🤗 Transformers can do +local: tasks_explained + title: How 🤗 Transformers solve tasks +local: model_summary + title: The Transformer model family +local: tokenizer_summary + title: Summary of the tokenizers +local: attention + title: Attention mechanisms +local: pad_truncation + title: Padding and truncation +local: bertology + title: BERTology +local: perplexity + title: Perplexity of fixed-length models +local: pipeline_webserver + title: Pipelines for webserver inference +local: model_memory_anatomy + title: Model training anatomy +local: llm_tutorial_optimization + title: Getting the most out of LLMs + title: Conceptual guides +sections: +sections: +local: main_classes/agent + title: Agents and Tools +local: model_doc/auto + title: Auto Classes +local: main_classes/backbones + title: Backbones +local: main_classes/callback + title: Callbacks +local: main_classes/configuration + title: Configuration +local: main_classes/data_collator + title: Data Collator +local: main_classes/keras_callbacks + title: Keras callbacks +local: main_classes/logging + title: Logging +local: main_classes/model + title: Models +local: main_classes/text_generation + title: Text Generation +local: main_classes/onnx + title: ONNX +local: main_classes/optimizer_schedules + title: Optimization +local: main_classes/output + title: Model outputs +local: main_classes/pipelines + title: Pipelines +local: main_classes/processors + title: Processors +local: main_classes/quantization + title: Quantization +local: main_classes/tokenizer + title: Tokenizer +local: main_classes/trainer + title: Trainer +local: main_classes/deepspeed + title: DeepSpeed +local: main_classes/feature_extractor + title: Feature Extractor +local: main_classes/image_processor + title: Image Processor +title: Main Classes + +sections: +isExpanded: false + sections: +local: model_doc/albert + title: ALBERT +local: model_doc/bart + title: BART +local: model_doc/barthez + title: BARThez +local: model_doc/bartpho + title: BARTpho +local: model_doc/bert + title: BERT +local: model_doc/bert-generation + title: BertGeneration +local: model_doc/bert-japanese + title: BertJapanese +local: model_doc/bertweet + title: Bertweet 
+local: model_doc/big_bird + title: BigBird +local: model_doc/bigbird_pegasus + title: BigBirdPegasus +local: model_doc/biogpt + title: BioGpt +local: model_doc/blenderbot + title: Blenderbot +local: model_doc/blenderbot-small + title: Blenderbot Small +local: model_doc/bloom + title: BLOOM +local: model_doc/bort + title: BORT +local: model_doc/byt5 + title: ByT5 +local: model_doc/camembert + title: CamemBERT +local: model_doc/canine + title: CANINE +local: model_doc/codegen + title: CodeGen +local: model_doc/code_llama + title: CodeLlama +local: model_doc/convbert + title: ConvBERT +local: model_doc/cpm + title: CPM +local: model_doc/cpmant + title: CPMANT +local: model_doc/ctrl + title: CTRL +local: model_doc/deberta + title: DeBERTa +local: model_doc/deberta-v2 + title: DeBERTa-v2 +local: model_doc/dialogpt + title: DialoGPT +local: model_doc/distilbert + title: DistilBERT +local: model_doc/dpr + title: DPR +local: model_doc/electra + title: ELECTRA +local: model_doc/encoder-decoder + title: Encoder Decoder Models +local: model_doc/ernie + title: ERNIE +local: model_doc/ernie_m + title: ErnieM +local: model_doc/esm + title: ESM +local: model_doc/falcon + title: Falcon +local: model_doc/fastspeech2_conformer + title: FastSpeech2Conformer +local: model_doc/flan-t5 + title: FLAN-T5 +local: model_doc/flan-ul2 + title: FLAN-UL2 +local: model_doc/flaubert + title: FlauBERT +local: model_doc/fnet + title: FNet +local: model_doc/fsmt + title: FSMT +local: model_doc/funnel + title: Funnel Transformer +local: model_doc/fuyu + title: Fuyu +local: model_doc/openai-gpt + title: GPT +local: model_doc/gpt_neo + title: GPT Neo +local: model_doc/gpt_neox + title: GPT NeoX +local: model_doc/gpt_neox_japanese + title: GPT NeoX Japanese +local: model_doc/gptj + title: GPT-J +local: model_doc/gpt2 + title: GPT2 +local: model_doc/gpt_bigcode + title: GPTBigCode +local: model_doc/gptsan-japanese + title: GPTSAN Japanese +local: model_doc/gpt-sw3 + title: GPTSw3 +local: model_doc/herbert + title: HerBERT +local: model_doc/ibert + title: I-BERT +local: model_doc/jukebox + title: Jukebox +local: model_doc/led + title: LED +local: model_doc/llama + title: LLaMA +local: model_doc/llama2 + title: Llama2 +local: model_doc/longformer + title: Longformer +local: model_doc/longt5 + title: LongT5 +local: model_doc/luke + title: LUKE +local: model_doc/m2m_100 + title: M2M100 +local: model_doc/madlad-400 + title: MADLAD-400 +local: model_doc/marian + title: MarianMT +local: model_doc/markuplm + title: MarkupLM +local: model_doc/mbart + title: MBart and MBart-50 +local: model_doc/mega + title: MEGA +local: model_doc/megatron-bert + title: MegatronBERT +local: model_doc/megatron_gpt2 + title: MegatronGPT2 +local: model_doc/mistral + title: Mistral +local: model_doc/mixtral + title: Mixtral +local: model_doc/mluke + title: mLUKE +local: model_doc/mobilebert + title: MobileBERT +local: model_doc/mpnet + title: MPNet +local: model_doc/mpt + title: MPT +local: model_doc/mra + title: MRA +local: model_doc/mt5 + title: MT5 +local: model_doc/mvp + title: MVP +local: model_doc/nezha + title: NEZHA +local: model_doc/nllb + title: NLLB +local: model_doc/nllb-moe + title: NLLB-MoE +local: model_doc/nystromformer + title: Nyströmformer +local: model_doc/open-llama + title: Open-Llama +local: model_doc/opt + title: OPT +local: model_doc/pegasus + title: Pegasus +local: model_doc/pegasus_x + title: PEGASUS-X +local: model_doc/persimmon + title: Persimmon +local: model_doc/phi + title: Phi +local: model_doc/phobert + title: PhoBERT 
+local: model_doc/plbart + title: PLBart +local: model_doc/prophetnet + title: ProphetNet +local: model_doc/qdqbert + title: QDQBert +local: model_doc/qwen2 + title: Qwen2 +local: model_doc/rag + title: RAG +local: model_doc/realm + title: REALM +local: model_doc/reformer + title: Reformer +local: model_doc/rembert + title: RemBERT +local: model_doc/retribert + title: RetriBERT +local: model_doc/roberta + title: RoBERTa +local: model_doc/roberta-prelayernorm + title: RoBERTa-PreLayerNorm +local: model_doc/roc_bert + title: RoCBert +local: model_doc/roformer + title: RoFormer +local: model_doc/rwkv + title: RWKV +local: model_doc/splinter + title: Splinter +local: model_doc/squeezebert + title: SqueezeBERT +local: model_doc/stablelm + title: StableLm +local: model_doc/switch_transformers + title: SwitchTransformers +local: model_doc/t5 + title: T5 +local: model_doc/t5v1.1 + title: T5v1.1 +local: model_doc/tapex + title: TAPEX +local: model_doc/transfo-xl + title: Transformer XL +local: model_doc/ul2 + title: UL2 +local: model_doc/umt5 + title: UMT5 +local: model_doc/xmod + title: X-MOD +local: model_doc/xglm + title: XGLM +local: model_doc/xlm + title: XLM +local: model_doc/xlm-prophetnet + title: XLM-ProphetNet +local: model_doc/xlm-roberta + title: XLM-RoBERTa +local: model_doc/xlm-roberta-xl + title: XLM-RoBERTa-XL +local: model_doc/xlm-v + title: XLM-V +local: model_doc/xlnet + title: XLNet +local: model_doc/yoso + title: YOSO + title: Text models +isExpanded: false + sections: +local: model_doc/beit + title: BEiT +local: model_doc/bit + title: BiT +local: model_doc/conditional_detr + title: Conditional DETR +local: model_doc/convnext + title: ConvNeXT +local: model_doc/convnextv2 + title: ConvNeXTV2 +local: model_doc/cvt + title: CvT +local: model_doc/deformable_detr + title: Deformable DETR +local: model_doc/deit + title: DeiT +local: model_doc/depth_anything + title: Depth Anything +local: model_doc/deta + title: DETA +local: model_doc/detr + title: DETR +local: model_doc/dinat + title: DiNAT +local: model_doc/dinov2 + title: DINOV2 +local: model_doc/dit + title: DiT +local: model_doc/dpt + title: DPT +local: model_doc/efficientformer + title: EfficientFormer +local: model_doc/efficientnet + title: EfficientNet +local: model_doc/focalnet + title: FocalNet +local: model_doc/glpn + title: GLPN +local: model_doc/imagegpt + title: ImageGPT +local: model_doc/levit + title: LeViT +local: model_doc/mask2former + title: Mask2Former +local: model_doc/maskformer + title: MaskFormer +local: model_doc/mobilenet_v1 + title: MobileNetV1 +local: model_doc/mobilenet_v2 + title: MobileNetV2 +local: model_doc/mobilevit + title: MobileViT +local: model_doc/mobilevitv2 + title: MobileViTV2 +local: model_doc/nat + title: NAT +local: model_doc/poolformer + title: PoolFormer +local: model_doc/pvt + title: Pyramid Vision Transformer (PVT) +local: model_doc/regnet + title: RegNet +local: model_doc/resnet + title: ResNet +local: model_doc/segformer + title: SegFormer +local: model_doc/swiftformer + title: SwiftFormer +local: model_doc/swin + title: Swin Transformer +local: model_doc/swinv2 + title: Swin Transformer V2 +local: model_doc/swin2sr + title: Swin2SR +local: model_doc/table-transformer + title: Table Transformer +local: model_doc/upernet + title: UperNet +local: model_doc/van + title: VAN +local: model_doc/vit + title: Vision Transformer (ViT) +local: model_doc/vit_hybrid + title: ViT Hybrid +local: model_doc/vitdet + title: ViTDet +local: model_doc/vit_mae + title: ViTMAE +local: model_doc/vitmatte 
+ title: ViTMatte +local: model_doc/vit_msn + title: ViTMSN +local: model_doc/yolos + title: YOLOS + title: Vision models +isExpanded: false + sections: +local: model_doc/audio-spectrogram-transformer + title: Audio Spectrogram Transformer +local: model_doc/bark + title: Bark +local: model_doc/clap + title: CLAP +local: model_doc/encodec + title: EnCodec +local: model_doc/hubert + title: Hubert +local: model_doc/mctct + title: MCTCT +local: model_doc/mms + title: MMS +local: model_doc/musicgen + title: MusicGen +local: model_doc/pop2piano + title: Pop2Piano +local: model_doc/seamless_m4t + title: Seamless-M4T +local: model_doc/seamless_m4t_v2 + title: SeamlessM4T-v2 +local: model_doc/sew + title: SEW +local: model_doc/sew-d + title: SEW-D +local: model_doc/speech_to_text + title: Speech2Text +local: model_doc/speech_to_text_2 + title: Speech2Text2 +local: model_doc/speecht5 + title: SpeechT5 +local: model_doc/unispeech + title: UniSpeech +local: model_doc/unispeech-sat + title: UniSpeech-SAT +local: model_doc/univnet + title: UnivNet +local: model_doc/vits + title: VITS +local: model_doc/wav2vec2 + title: Wav2Vec2 +local: model_doc/wav2vec2-bert + title: Wav2Vec2-BERT +local: model_doc/wav2vec2-conformer + title: Wav2Vec2-Conformer +local: model_doc/wav2vec2_phoneme + title: Wav2Vec2Phoneme +local: model_doc/wavlm + title: WavLM +local: model_doc/whisper + title: Whisper +local: model_doc/xls_r + title: XLS-R +local: model_doc/xlsr_wav2vec2 + title: XLSR-Wav2Vec2 + title: Audio models +isExpanded: false + sections: +local: model_doc/timesformer + title: TimeSformer +local: model_doc/videomae + title: VideoMAE +local: model_doc/vivit + title: ViViT + title: Video models +isExpanded: false + sections: +local: model_doc/align + title: ALIGN +local: model_doc/altclip + title: AltCLIP +local: model_doc/blip + title: BLIP +local: model_doc/blip-2 + title: BLIP-2 +local: model_doc/bridgetower + title: BridgeTower +local: model_doc/bros + title: BROS +local: model_doc/chinese_clip + title: Chinese-CLIP +local: model_doc/clip + title: CLIP +local: model_doc/clipseg + title: CLIPSeg +local: model_doc/clvp + title: CLVP +local: model_doc/data2vec + title: Data2Vec +local: model_doc/deplot + title: DePlot +local: model_doc/donut + title: Donut +local: model_doc/flava + title: FLAVA +local: model_doc/git + title: GIT +local: model_doc/groupvit + title: GroupViT +local: model_doc/idefics + title: IDEFICS +local: model_doc/instructblip + title: InstructBLIP +local: model_doc/kosmos-2 + title: KOSMOS-2 +local: model_doc/layoutlm + title: LayoutLM +local: model_doc/layoutlmv2 + title: LayoutLMV2 +local: model_doc/layoutlmv3 + title: LayoutLMV3 +local: model_doc/layoutxlm + title: LayoutXLM +local: model_doc/lilt + title: LiLT +local: model_doc/llava + title: Llava +local: model_doc/lxmert + title: LXMERT +local: model_doc/matcha + title: MatCha +local: model_doc/mgp-str + title: MGP-STR +local: model_doc/nougat + title: Nougat +local: model_doc/oneformer + title: OneFormer +local: model_doc/owlvit + title: OWL-ViT +local: model_doc/owlv2 + title: OWLv2 +local: model_doc/perceiver + title: Perceiver +local: model_doc/pix2struct + title: Pix2Struct +local: model_doc/sam + title: Segment Anything +local: model_doc/siglip + title: SigLIP +local: model_doc/speech-encoder-decoder + title: Speech Encoder Decoder Models +local: model_doc/tapas + title: TAPAS +local: model_doc/trocr + title: TrOCR +local: model_doc/tvlt + title: TVLT +local: model_doc/tvp + title: TVP +local: model_doc/vilt + title: ViLT +local: 
model_doc/vipllava + title: VipLlava +local: model_doc/vision-encoder-decoder + title: Vision Encoder Decoder Models +local: model_doc/vision-text-dual-encoder + title: Vision Text Dual Encoder +local: model_doc/visual_bert + title: VisualBERT +local: model_doc/xclip + title: X-CLIP + title: Multimodal models +isExpanded: false + sections: +local: model_doc/decision_transformer + title: Decision Transformer +local: model_doc/trajectory_transformer + title: Trajectory Transformer + title: Reinforcement learning models +isExpanded: false + sections: +local: model_doc/autoformer + title: Autoformer +local: model_doc/informer + title: Informer +local: model_doc/patchtsmixer + title: PatchTSMixer +local: model_doc/patchtst + title: PatchTST +local: model_doc/time_series_transformer + title: Time Series Transformer + title: Time series models +isExpanded: false + sections: +local: model_doc/graphormer + title: Graphormer + title: Graph models +title: Models + +sections: +local: internal/modeling_utils + title: Custom Layers and Utilities +local: internal/pipelines_utils + title: Utilities for pipelines +local: internal/tokenization_utils + title: Utilities for Tokenizers +local: internal/trainer_utils + title: Utilities for Trainer +local: internal/generation_utils + title: Utilities for Generation +local: internal/image_processing_utils + title: Utilities for Image Processors +local: internal/audio_utils + title: Utilities for Audio processing +local: internal/file_utils + title: General Utilities +local: internal/time_series_utils + title: Utilities for Time Series +title: Internal Helpers + title: API \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_0.txt b/chunked/nltk_chunking/_accelerate/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfeb3f7c4c3127680d6ebfe5c04094b30f98de46 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_0.txt @@ -0,0 +1,3 @@ + +Distributed training with 🤗 Accelerate +As models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_1.txt b/chunked/nltk_chunking/_accelerate/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0efdf84d0d10e49abfbe6ccdb6a28febd150f3e --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_1.txt @@ -0,0 +1 @@ +At Hugging Face, we created the 🤗 Accelerate library to help users easily train a 🤗 Transformers model on any type of distributed setup, whether it is multiple GPU's on one machine or multiple GPU's across several machines. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_10.txt b/chunked/nltk_chunking/_accelerate/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f6efca4cf2a17c212af67ae129dde6a33a97a2e --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_10.txt @@ -0,0 +1,6 @@ +Wrap all the code responsible for training in a function, and pass it to [~accelerate.notebook_launcher]: + +from accelerate import notebook_launcher +notebook_launcher(training_function) + +For more information about 🤗 Accelerate and its rich features, refer to the documentation. 
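As a hedged supplement to the notebook_launcher call shown just above: the launcher can also forward arguments to the training function and take an explicit process count. Everything in the sketch below (the argument names, the learning rate, the epoch count, and the two-process setting) is an illustrative assumption, not part of the original guide.

from accelerate import notebook_launcher

def training_function(learning_rate, num_epochs):
    # The distributed training loop described in this guide would go here.
    ...

# args is forwarded to training_function; num_processes selects the worker count
# (for example 8 on a Colab TPU; 2 here is purely illustrative).
notebook_launcher(training_function, args=(3e-5, 3), num_processes=2)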
\ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_2.txt b/chunked/nltk_chunking/_accelerate/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ecb16e2394c8c07cc858062b5fe12da6abd8e85 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_2.txt @@ -0,0 +1 @@ +In this tutorial, learn how to customize your native PyTorch training loop to enable training in a distributed environment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_3.txt b/chunked/nltk_chunking/_accelerate/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a9ec9b46bf42d9df2bbbd2e1822a3b93d85bf09 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_3.txt @@ -0,0 +1,5 @@ +Setup +Get started by installing 🤗 Accelerate: + +pip install accelerate +Then import and create an [~accelerate.Accelerator] object. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_4.txt b/chunked/nltk_chunking/_accelerate/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3c762fd88e241ddf457f3ca9f0624447f0007d7 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_4.txt @@ -0,0 +1 @@ +The [~accelerate.Accelerator] will automatically detect your type of distributed setup and initialize all the necessary components for training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_5.txt b/chunked/nltk_chunking/_accelerate/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f9fb5c5715fdbf9352d1712e1f9dbc813db7f00 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_5.txt @@ -0,0 +1 @@ +You don't need to explicitly place your model on a device. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_6.txt b/chunked/nltk_chunking/_accelerate/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..04fa7fa0141fd08fa1e9487f46d3a7ec743280dd --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_6.txt @@ -0,0 +1,5 @@ +from accelerate import Accelerator +accelerator = Accelerator() + +Prepare to accelerate +The next step is to pass all the relevant training objects to the [~accelerate.Accelerator.prepare] method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_7.txt b/chunked/nltk_chunking/_accelerate/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a432a98f0f266e9b0c7de7b1c763a7e2fcdfa54 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_7.txt @@ -0,0 +1,21 @@ +This includes your training and evaluation DataLoaders, a model and an optimizer: + +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + train_dataloader, eval_dataloader, model, optimizer + ) + +Backward +The last addition is to replace the typical loss.backward() in your training loop with 🤗 Accelerate's [~accelerate.Accelerator.backward]method: + +for epoch in range(num_epochs): + for batch in train_dataloader: + outputs = model(**batch) + loss = outputs.loss + accelerator.backward(loss) + + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + +As you can see in the following code, you only need to add four additional lines of code to your training loop to enable distributed training! 
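To tie the prepare() and backward() steps above together, here is a minimal, self-contained sketch that trains on synthetic data; the tiny linear model, the mean-squared-error loss, and all hyperparameters are assumptions made purely for illustration and are not taken from the original tutorial.

import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()

# Synthetic regression data stands in for a real dataset (assumption).
features = torch.randn(64, 16)
targets = torch.randn(64, 1)
train_dataloader = DataLoader(TensorDataset(features, targets), batch_size=8, shuffle=True)

model = torch.nn.Linear(16, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

# prepare() wraps the objects for whatever distributed setup was detected.
model, optimizer, train_dataloader = accelerator.prepare(model, optimizer, train_dataloader)

model.train()
for epoch in range(3):
    for batch_features, batch_targets in train_dataloader:
        loss = torch.nn.functional.mse_loss(model(batch_features), batch_targets)
        accelerator.backward(loss)  # replaces the usual loss.backward()
        optimizer.step()
        optimizer.zero_grad()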
\ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_8.txt b/chunked/nltk_chunking/_accelerate/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2e803beb10ba64f6e7fb943a92c083a19ef5d45 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_8.txt @@ -0,0 +1,41 @@ ++ from accelerate import Accelerator + from transformers import AdamW, AutoModelForSequenceClassification, get_scheduler + +accelerator = Accelerator() + +model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2) + optimizer = AdamW(model.parameters(), lr=3e-5) + +device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") + +model.to(device) + +train_dataloader, eval_dataloader, model, optimizer = accelerator.prepare( + +train_dataloader, eval_dataloader, model, optimizer +) + +num_epochs = 3 + num_training_steps = num_epochs * len(train_dataloader) + lr_scheduler = get_scheduler( + "linear", + optimizer=optimizer, + num_warmup_steps=0, + num_training_steps=num_training_steps + ) +progress_bar = tqdm(range(num_training_steps)) +model.train() + for epoch in range(num_epochs): + for batch in train_dataloader: + + outputs = model(**batch) + loss = outputs.loss + ++ accelerator.backward(loss) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + progress_bar.update(1) + +Train +Once you've added the relevant lines of code, launch your training in a script or a notebook like Colaboratory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_accelerate/chunk_9.txt b/chunked/nltk_chunking/_accelerate/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..024946adeb7d0d458c018e88a696f4815529bdf9 --- /dev/null +++ b/chunked/nltk_chunking/_accelerate/chunk_9.txt @@ -0,0 +1,9 @@ +Train with a script +If you are running your training from a script, run the following command to create and save a configuration file: + +accelerate config +Then launch your training with: + +accelerate launch train.py +Train with a notebook +🤗 Accelerate can also run in a notebook if you're planning on using Colaboratory's TPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_0.txt b/chunked/nltk_chunking/_add_new_model/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..79df95124664158fc256391d95133cefd06b4b70 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_0.txt @@ -0,0 +1,2 @@ + +How to add a model to 🤗 Transformers? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_1.txt b/chunked/nltk_chunking/_add_new_model/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..3832523ebbfd3c650b63e57f853f649cea133574 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_1.txt @@ -0,0 +1 @@ +The 🤗 Transformers library is often able to offer new models thanks to community contributors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_10.txt b/chunked/nltk_chunking/_add_new_model/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f6d4f6b20cb3e7c37921a41970edc544bbb9d88 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_10.txt @@ -0,0 +1,2 @@ +🤗 Transformers is a very opinionated library, so there is a +chance that you don't agree with some of the library's philosophies or design choices. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_100.txt b/chunked/nltk_chunking/_add_new_model/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cc1ece3b1c8b571c52e223f5d0e11bbf53af8e7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_100.txt @@ -0,0 +1,3 @@ +At Hugging Face, one of our main goals is to make people +stand on the shoulders of giants which translates here very well into taking a working model and rewriting it to make +it as accessible, user-friendly, and beautiful as possible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_101.txt b/chunked/nltk_chunking/_add_new_model/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd015d814f3a863fcd47a50c4770614a9d1a3211 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_101.txt @@ -0,0 +1,2 @@ +This is the number-one motivation to re-implement +models into 🤗 Transformers - trying to make complex new NLP technology accessible to everybody. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_102.txt b/chunked/nltk_chunking/_add_new_model/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..0abc1f15760736bb2edcd7681053332619a260d6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_102.txt @@ -0,0 +1 @@ +You should start thereby by diving into the original repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_103.txt b/chunked/nltk_chunking/_add_new_model/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d307231b3309eada30ce7dbb4b72ca5a2c8d3b4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_103.txt @@ -0,0 +1 @@ +Successfully running the official pretrained model in the original repository is often the most difficult step. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_104.txt b/chunked/nltk_chunking/_add_new_model/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..61fa675b45660e4bcc261f9f982194686434b2b6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_104.txt @@ -0,0 +1 @@ +From our experience, it is very important to spend some time getting familiar with the original code-base. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_105.txt b/chunked/nltk_chunking/_add_new_model/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a123217fa7d13b1e4302d36716da57c80b66737 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_105.txt @@ -0,0 +1,4 @@ +You need to +figure out the following: + +Where to find the pretrained weights? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_106.txt b/chunked/nltk_chunking/_add_new_model/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff094fad9da794cd14009606ee755afa0823602f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_106.txt @@ -0,0 +1 @@ +How to load the pretrained weights into the corresponding model? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_107.txt b/chunked/nltk_chunking/_add_new_model/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..32d4d4ed2867ec922606a72993806ea11aa6e2f7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_107.txt @@ -0,0 +1 @@ +How to run the tokenizer independently from the model? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_108.txt b/chunked/nltk_chunking/_add_new_model/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..d67365230593395e25d689c666cef296e873397c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_108.txt @@ -0,0 +1 @@ +Trace one forward pass so that you know which classes and functions are required for a simple forward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_109.txt b/chunked/nltk_chunking/_add_new_model/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..f051506d32b7a2454a46558408c0044f1ec08223 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_109.txt @@ -0,0 +1,2 @@ +Usually, + you only have to reimplement those functions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_11.txt b/chunked/nltk_chunking/_add_new_model/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..74822a9ed15ea7eb73a0f10467742d6ad6a58b0a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_11.txt @@ -0,0 +1,3 @@ +From our experience, however, we +found that the fundamental design choices and philosophies of the library are crucial to efficiently scale 🤗 +Transformers while keeping maintenance costs at a reasonable level. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_110.txt b/chunked/nltk_chunking/_add_new_model/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..b67a551d1138294b0d84e29cd6116c07b40e734b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_110.txt @@ -0,0 +1 @@ +Be able to locate the important components of the model: Where is the model's class? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_111.txt b/chunked/nltk_chunking/_add_new_model/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a5d41b695bc239a77ddc52577f185c57276994b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_111.txt @@ -0,0 +1,2 @@ +Are there model sub-classes, + e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_112.txt b/chunked/nltk_chunking/_add_new_model/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..c574882cd1aafd3fe5f421634a973c1e33a931aa --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_112.txt @@ -0,0 +1 @@ +EncoderModel, DecoderModel? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_113.txt b/chunked/nltk_chunking/_add_new_model/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..5361d6f5ba114cfa9b948c88057611ae2df1ea2e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_113.txt @@ -0,0 +1 @@ +Where is the self-attention layer? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_114.txt b/chunked/nltk_chunking/_add_new_model/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..871e84ed3d694c36ff31e0ffdcaedc9d5fa1f4a3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_114.txt @@ -0,0 +1,2 @@ +Are there multiple different attention layers, + e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_115.txt b/chunked/nltk_chunking/_add_new_model/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..23423ead08535e4636ab6df855f2d8a8b70f928a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_115.txt @@ -0,0 +1 @@ +self-attention, cross-attention? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_116.txt b/chunked/nltk_chunking/_add_new_model/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3af70baefaf8435618f32fa2dd2700deef23be1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_116.txt @@ -0,0 +1 @@ +How can you debug the model in the original environment of the repo? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_117.txt b/chunked/nltk_chunking/_add_new_model/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..dce2f06a68f446ebb73f81167a62abe9254e6ed9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_117.txt @@ -0,0 +1,2 @@ +Do you have to add print statements, can you + work with an interactive debugger like ipdb, or should you use an efficient IDE to debug the model, like PyCharm? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_118.txt b/chunked/nltk_chunking/_add_new_model/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f861d6cd41513ef0c9b4932d4d3ec160dfe6fd3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_118.txt @@ -0,0 +1,2 @@ +It is very important that before you start the porting process, you can efficiently debug code in the original +repository! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_119.txt b/chunked/nltk_chunking/_add_new_model/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..734ae332fd7d470e2adcb464d1533b0b8ad31000 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_119.txt @@ -0,0 +1,2 @@ +Also, remember that you are working with an open-source library, so do not hesitate to open an issue, or +even a pull request in the original repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_12.txt b/chunked/nltk_chunking/_add_new_model/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..af8d7d7d0818ac39c1d924fcdfb4ecc18c055a16 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_12.txt @@ -0,0 +1 @@ +A good first starting point to better understand the library is to read the documentation of our philosophy. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_120.txt b/chunked/nltk_chunking/_add_new_model/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1e30488085cdd27dfd535e6c9c664ed7e1a6667 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_120.txt @@ -0,0 +1,2 @@ +The maintainers of this repository are most likely very happy about +someone looking into their code! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_121.txt b/chunked/nltk_chunking/_add_new_model/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..429efe5b2c1b3db70b7653b11cd0ff4784976429 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_121.txt @@ -0,0 +1,2 @@ +At this point, it is really up to you which debugging environment and strategy you prefer to use to debug the original +model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_122.txt b/chunked/nltk_chunking/_add_new_model/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..99c2da77d1c89b207e4d852bc24d80ef4e7e7262 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_122.txt @@ -0,0 +1,2 @@ +We strongly advise against setting up a costly GPU environment, but simply work on a CPU both when starting to +dive into the original repository and also when starting to write the 🤗 Transformers implementation of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_123.txt b/chunked/nltk_chunking/_add_new_model/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f312ace4d2f11097b767c8c7f654a777ea8bb04 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_123.txt @@ -0,0 +1,3 @@ +Only +at the very end, when the model has already been successfully ported to 🤗 Transformers, one should verify that the +model also works as expected on GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_124.txt b/chunked/nltk_chunking/_add_new_model/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..d45b521311c9dcc0f2d70d65a1e3f97d12efbfb0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_124.txt @@ -0,0 +1,4 @@ +In general, there are two possible debugging environments for running the original model + +Jupyter notebooks / google colab +Local python scripts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_125.txt b/chunked/nltk_chunking/_add_new_model/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..40572f1aa281c8a53a11884dca2c4072da9858e5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_125.txt @@ -0,0 +1,2 @@ +Jupyter notebooks have the advantage that they allow for cell-by-cell execution which can be helpful to better split +logical components from one another and to have faster debugging cycles as intermediate results can be stored. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_126.txt b/chunked/nltk_chunking/_add_new_model/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..6097aaae0bbeff07f2f4e2a495a463307d164456 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_126.txt @@ -0,0 +1,3 @@ +Also, +notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging +Face team for help. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_127.txt b/chunked/nltk_chunking/_add_new_model/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac104048482dad2c5e637005468e73f4eada07c0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_127.txt @@ -0,0 +1 @@ +If you are familiar with Jupyter notebooks, we strongly recommend you work with them. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_128.txt b/chunked/nltk_chunking/_add_new_model/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa64202c1962fc817cfbd1693a7bccfcb78cd1c7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_128.txt @@ -0,0 +1,3 @@ +The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend +some time adjusting to the new programming environment and you might not be able to use your known debugging tools +anymore, like ipdb. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_129.txt b/chunked/nltk_chunking/_add_new_model/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d679f31e06b41415a8fceab2338ac35d42b10a2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_129.txt @@ -0,0 +1,2 @@ +For each code-base, a good first step is always to load a small pretrained checkpoint and to be able to reproduce a +single forward pass using a dummy integer vector of input IDs as an input. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_13.txt b/chunked/nltk_chunking/_add_new_model/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..56d51cc5b611d6e51a4518612414dc0303cdff50 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_13.txt @@ -0,0 +1,6 @@ +As a result of our way of working, there are some choices that we try to apply to all models: + +Composition is generally favored over-abstraction +Duplicating code is not always bad if it strongly improves the readability or accessibility of a model +Model files are as self-contained as possible so that when you read the code of a specific model, you ideally only + have to look into the respective modeling_.py file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_130.txt b/chunked/nltk_chunking/_add_new_model/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b6315e657aa0e7d174658becba1485cd33f45c9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_130.txt @@ -0,0 +1,14 @@ +Such a script could look like this (in +pseudocode): +python +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids +original_output = model.predict(input_ids) +Next, regarding the debugging strategy, there are generally a few from which to choose from: + +Decompose the original model into many small testable components and run a forward pass on each of those for + verification +Decompose the original model only into the original tokenizer and the original model, run a forward pass on + those, and use intermediate print statements or breakpoints for verification + +Again, it is up to you which strategy to choose. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_131.txt b/chunked/nltk_chunking/_add_new_model/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..9172337311e520da1ff041b14d4ab3d86e1f0dd4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_131.txt @@ -0,0 +1,2 @@ +Often, one or the other is advantageous depending on the original code +base. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_132.txt b/chunked/nltk_chunking/_add_new_model/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdb31a8510c3887a258dbb2bed0cb83e92dd2cc2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_132.txt @@ -0,0 +1 @@ +If the original code-base allows you to decompose the model into smaller sub-components, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_133.txt b/chunked/nltk_chunking/_add_new_model/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..a00cb667ca1b87197a6ae6b37643f495838c63b1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_133.txt @@ -0,0 +1,2 @@ +if the original +code-base can easily be run in eager mode, it is usually worth the effort to do so. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_134.txt b/chunked/nltk_chunking/_add_new_model/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..31972f1c34b2b7bf817538fc7e955399b7014d6b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_134.txt @@ -0,0 +1,15 @@ +There are some important advantages +to taking the more difficult road in the beginning: + +at a later stage when comparing the original model to the Hugging Face implementation, you can verify automatically + for each component individually that the corresponding component of the 🤗 Transformers implementation matches instead + of relying on visual comparison via print statements +it can give you some rope to decompose the big problem of porting a model into smaller problems of just porting + individual components and thus structure your work better +separating the model into logical meaningful components will help you to get a better overview of the model's design + and thus to better understand the model +at a later stage those component-by-component tests help you to ensure that no regression occurs as you continue + changing your code + +Lysandre's integration checks for ELECTRA +gives a nice example of how this can be done. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_135.txt b/chunked/nltk_chunking/_add_new_model/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..76a97ad26251c8c48471920e80e88db24dfc8093 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_135.txt @@ -0,0 +1,2 @@ +However, if the original code-base is very complex or only allows intermediate components to be run in a compiled mode, +it might be too time-consuming or even impossible to separate the model into smaller testable sub-components. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_136.txt b/chunked/nltk_chunking/_add_new_model/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30cc7dba977d15c47c1bee69d80b9f9cae55507 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_136.txt @@ -0,0 +1,3 @@ +A good +example is T5's MeshTensorFlow library which is +very complex and does not offer a simple way to decompose the model into its sub-components. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_137.txt b/chunked/nltk_chunking/_add_new_model/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..51cfc1aff38b3b626990b30b2618bb185b0ab8f2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_137.txt @@ -0,0 +1,2 @@ +For such libraries, one +often relies on verifying print statements. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_138.txt b/chunked/nltk_chunking/_add_new_model/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..1929816d1481648cc90db99a9967a8e0bcee1598 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_138.txt @@ -0,0 +1,2 @@ +No matter which strategy you choose, the recommended procedure is often the same that you should start to debug the +starting layers first and the ending layers last. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_139.txt b/chunked/nltk_chunking/_add_new_model/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..3546ecda887f8a4254002ab5159ccef1b05fdd0c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_139.txt @@ -0,0 +1,11 @@ +It is recommended that you retrieve the output, either by print statements or sub-component functions, of the following +layers in the following order: + +Retrieve the input IDs passed to the model +Retrieve the word embeddings +Retrieve the input of the first Transformer layer +Retrieve the output of the first Transformer layer +Retrieve the output of the following n - 1 Transformer layers +Retrieve the output of the whole BrandNewBert Model + +Input IDs should thereby consists of an array of integers, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_14.txt b/chunked/nltk_chunking/_add_new_model/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ac3b521faeec2f4daaa2f6dd270b74c83362657 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_14.txt @@ -0,0 +1 @@ +In our opinion, the library's code is not just a means to provide a product, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_140.txt b/chunked/nltk_chunking/_add_new_model/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..80ba474f256798040afeb1e209c43f1c90633c75 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_140.txt @@ -0,0 +1,12 @@ +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +The outputs of the following layers often consist of multi-dimensional float arrays and can look like this: +[[ + [-0.1465, -0.6501, 0.1993, , 0.1451, 0.3430, 0.6024], + [-0.4417, -0.5920, 0.3450, , -0.3062, 0.6182, 0.7132], + [-0.5009, -0.7122, 0.4548, , -0.3662, 0.6091, 0.7648], + , + [-0.5613, -0.6332, 0.4324, , -0.3792, 0.7372, 0.9288], + [-0.5416, -0.6345, 0.4180, , -0.3564, 0.6992, 0.9191], + [-0.5334, -0.6403, 0.4271, , -0.3339, 0.6533, 0.8694]]], +We expect that every model added to 🤗 Transformers passes a couple of integration tests, meaning that the original +model and the reimplemented version in 🤗 Transformers have to give the exact same output up to a precision of 0.001! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_141.txt b/chunked/nltk_chunking/_add_new_model/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..d05ae114a447fa17704ae40457562eb232be4ae6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_141.txt @@ -0,0 +1,2 @@ +Since it is normal that the exact same model written in different libraries can give a slightly different output +depending on the library framework, we accept an error tolerance of 1e-3 (0.001). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_142.txt b/chunked/nltk_chunking/_add_new_model/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..bec06283f2c0efefc44ea5a744bc9b1ffc72ed9d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_142.txt @@ -0,0 +1,2 @@ +It is not enough if the model gives +nearly the same output, they have to be almost identical. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_143.txt b/chunked/nltk_chunking/_add_new_model/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdf8676769ff9fde61618dba9f0e72531667e4d6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_143.txt @@ -0,0 +1,4 @@ +Therefore, you will certainly compare the intermediate +outputs of the 🤗 Transformers version multiple times against the intermediate outputs of the original implementation of +brand_new_bert in which case an efficient debugging environment of the original repository is absolutely +important. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_144.txt b/chunked/nltk_chunking/_add_new_model/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f000a98e77aa73e96a25a57cdfb044fe98a7ce7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_144.txt @@ -0,0 +1 @@ +Here is some advice to make your debugging environment as efficient as possible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_145.txt b/chunked/nltk_chunking/_add_new_model/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..de3d0cb221a7d1a8f93b80b256c6a64d5f590c5c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_145.txt @@ -0,0 +1 @@ +Find the best way of debugging intermediate results. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_146.txt b/chunked/nltk_chunking/_add_new_model/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a258eac99324b97465d3168b194b2be47cec15f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_146.txt @@ -0,0 +1 @@ +Is the original repository written in PyTorch? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_147.txt b/chunked/nltk_chunking/_add_new_model/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8eb04ea464b73d24f8fedc956972a09132e2205 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_147.txt @@ -0,0 +1,3 @@ +Then you should + probably take the time to write a longer script that decomposes the original model into smaller sub-components to + retrieve intermediate values. 
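One way to write such a decomposition script in PyTorch, and to apply the 1e-3 tolerance discussed above, is to register forward hooks that record every sub-module's output and then compare those tensors against the ported implementation with torch.allclose. The sketch below is an assumption for illustration only: the three-layer stand-in model and the dummy "ported" tensors take the place of the real original and reimplemented models.

import torch

# Stand-in for the original implementation; in practice load the real original model here.
original_model = torch.nn.Sequential(
    torch.nn.Embedding(100, 16),
    torch.nn.Linear(16, 16),
    torch.nn.Linear(16, 16),
)
original_model.eval()

intermediate_outputs = {}

def make_hook(name):
    def hook(module, inputs, output):
        # Record each sub-module's output for later comparison.
        intermediate_outputs[name] = output.detach()
    return hook

for name, module in original_model.named_modules():
    if name:  # skip the top-level container itself
        module.register_forward_hook(make_hook(name))

input_ids = torch.tensor([[0, 4, 5, 2, 3, 7, 9]])
with torch.no_grad():
    original_model(input_ids)

# Layer-by-layer check against the ported implementation; the "ported" tensors
# here are dummies derived from the reference values purely to show the pattern.
for name, reference in intermediate_outputs.items():
    ported = reference + 1e-4 * torch.randn_like(reference)
    assert torch.allclose(reference, ported, atol=1e-3), f"{name} diverges beyond 1e-3"
print("all recorded sub-module outputs match within 1e-3")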
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_148.txt b/chunked/nltk_chunking/_add_new_model/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4b6579a6c102d69b4db6ca033403de022c2ab1c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_148.txt @@ -0,0 +1 @@ +Is the original repository written in Tensorflow 1? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_149.txt b/chunked/nltk_chunking/_add_new_model/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..866020c602b07a6d1d8c0b9fb7d1c193b68dc1fa --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_149.txt @@ -0,0 +1,3 @@ +Then you might have to rely on + TensorFlow print operations like tf.print to output + intermediate values. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_15.txt b/chunked/nltk_chunking/_add_new_model/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..abbc8e198b2ec9d75a8958894e0efcd92d9e2871 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_15.txt @@ -0,0 +1,2 @@ +the ability to use BERT for +inference, but also as the very product that we want to improve. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_150.txt b/chunked/nltk_chunking/_add_new_model/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7dfb5e9aa4e9f1b527de361b72dc5effd797add --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_150.txt @@ -0,0 +1 @@ +Is the original repository written in Jax? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_151.txt b/chunked/nltk_chunking/_add_new_model/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d3da63dbaee1a198a3d03163be3485936815582 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_151.txt @@ -0,0 +1,2 @@ +Then make sure that the model is not jitted when + running the forward pass, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_152.txt b/chunked/nltk_chunking/_add_new_model/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..231ad7fbf98c8740bbbe6defa4c1b72133d64f6a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_152.txt @@ -0,0 +1 @@ +check-out this link. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_153.txt b/chunked/nltk_chunking/_add_new_model/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..87a716efa90892fe05edce5066c9fb9ad005ea6d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_153.txt @@ -0,0 +1 @@ +Use the smallest pretrained checkpoint you can find. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_154.txt b/chunked/nltk_chunking/_add_new_model/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a996b7ab55e3aae67bd7d43913a6c90a59d7ca9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_154.txt @@ -0,0 +1,2 @@ +The smaller the checkpoint, the faster your debug cycle + becomes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_155.txt b/chunked/nltk_chunking/_add_new_model/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..9985f22e81ab9abd67ef64d6176fc5e8765b3024 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_155.txt @@ -0,0 +1 @@ +It is not efficient if your pretrained model is so big that your forward pass takes more than 10 seconds. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_156.txt b/chunked/nltk_chunking/_add_new_model/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..b897e691ec07ef4806f676317103785b7e82d427 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_156.txt @@ -0,0 +1,4 @@ +In case only very large checkpoints are available, it might make more sense to create a dummy model in the new + environment with randomly initialized weights and save those weights for comparison with the 🤗 Transformers version + of your model +Make sure you are using the easiest way of calling a forward pass in the original repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_157.txt b/chunked/nltk_chunking/_add_new_model/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f56f5c037c741a2de8d9d127ccaa91a76eb5e43 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_157.txt @@ -0,0 +1,2 @@ +Ideally, you want to + find the function in the original repository that only calls a single forward pass, i.e. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_158.txt b/chunked/nltk_chunking/_add_new_model/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..45d7ca6bf9bc53d7f938075671afe90cc671965a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_158.txt @@ -0,0 +1,2 @@ +that is often called + predict, evaluate, forward or __call__. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_159.txt b/chunked/nltk_chunking/_add_new_model/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..844706970d8c2eb5e30bc43f371426c41e8a51ee --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_159.txt @@ -0,0 +1,2 @@ +You don't want to debug a function that calls forward + multiple times, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_16.txt b/chunked/nltk_chunking/_add_new_model/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b0f73457eb3157178e330d60e82f3e24a8efa73 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_16.txt @@ -0,0 +1,2 @@ +Hence, when adding a model, the user is not only the +person who will use your model, but also everybody who will read, try to understand, and possibly tweak your code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_160.txt b/chunked/nltk_chunking/_add_new_model/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..9695868891cb91604f99bdbf0a40b49537b2b38d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_160.txt @@ -0,0 +1 @@ +to generate text, like autoregressive_sample, generate. 
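To make the single-forward-pass advice above concrete, the sketch below contrasts calling a model once with calling generate(), which loops over many forward passes internally. A randomly initialised toy GPT-2 configuration is used here purely as an assumed stand-in so nothing needs to be downloaded; the config values are illustrative.

import torch
from transformers import GPT2Config, GPT2LMHeadModel

# Deliberately tiny, randomly initialised model (assumed values, no download).
config = GPT2Config(vocab_size=100, n_positions=64, n_embd=32, n_layer=2, n_head=2)
model = GPT2LMHeadModel(config)
model.eval()

input_ids = torch.tensor([[0, 4, 5, 2, 3, 7, 9]])

# Debug and compare this: one deterministic forward pass.
with torch.no_grad():
    logits = model(input_ids).logits
print(logits.shape)  # torch.Size([1, 7, 100])

# Avoid debugging this while porting: generate() calls forward repeatedly.
# generated = model.generate(input_ids, max_new_tokens=5)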
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_161.txt b/chunked/nltk_chunking/_add_new_model/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b9c0426f33bde83b28835e64351678df9468ef --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_161.txt @@ -0,0 +1 @@ +Try to separate the tokenization from the model's forward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_162.txt b/chunked/nltk_chunking/_add_new_model/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6a6e4bf29cc79f604a8cd180600c8d581690968 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_162.txt @@ -0,0 +1,3 @@ +If the original repository shows examples where + you have to input a string, then try to find out where in the forward call the string input is changed to input ids + and start from this point. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_163.txt b/chunked/nltk_chunking/_add_new_model/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..f98896b9b90f59f86d90e55ecd19aa6ec1a80084 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_163.txt @@ -0,0 +1,2 @@ +This might mean that you have to possibly write a small script yourself or change the + original code so that you can directly input the ids instead of an input string. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_164.txt b/chunked/nltk_chunking/_add_new_model/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c807e04a9b251b1a977266680f0b86d774644c8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_164.txt @@ -0,0 +1,2 @@ +Make sure that the model in your debugging setup is not in training mode, which often causes the model to yield + random outputs due to multiple dropout layers in the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_165.txt b/chunked/nltk_chunking/_add_new_model/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7c114205d6faa0b16e7419b81f7666540d59bde --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_165.txt @@ -0,0 +1,2 @@ +Make sure that the forward pass in your debugging + environment is deterministic so that the dropout layers are not used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_166.txt b/chunked/nltk_chunking/_add_new_model/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..f50f1b2435214f6faf8192ee9267551124b732cb --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_166.txt @@ -0,0 +1,2 @@ +Or use transformers.utils.set_seed + if the old and new implementations are in the same framework. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_167.txt b/chunked/nltk_chunking/_add_new_model/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..de14cb3db1f36162db6d8d42ba09f80eb8df5c5d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_167.txt @@ -0,0 +1 @@ +The following section gives you more specific details/tips on how you can do this for brand_new_bert. 
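Before moving on, here is a small sketch illustrating the advice above about dropout and seeding; the stand-in module with a Dropout layer is an assumption used only to show that, in eval mode and with a fixed seed, repeated forward passes return identical outputs.

import torch
from transformers import set_seed  # also importable as transformers.set_seed

set_seed(0)  # seeds Python, NumPy and PyTorch RNGs in one call

# Stand-in module with dropout, to show why training mode breaks comparisons.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Dropout(p=0.5))
model.eval()  # dropout becomes a no-op, so outputs are reproducible

inputs = torch.randn(2, 8)
with torch.no_grad():
    first = model(inputs)
    second = model(inputs)
assert torch.equal(first, second)  # would fail in train() mode with dropout active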
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_168.txt b/chunked/nltk_chunking/_add_new_model/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c54c94f375df096d1fd6cd50a3abd1216690f24 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_168.txt @@ -0,0 +1 @@ +5.-14. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_169.txt b/chunked/nltk_chunking/_add_new_model/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..10b4306027fb57a9e03ab1897e71c50523ced723 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_169.txt @@ -0,0 +1,2 @@ +Port BrandNewBert to 🤗 Transformers +Next, you can finally start adding new code to 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_17.txt b/chunked/nltk_chunking/_add_new_model/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..47eaa26f24992a2c89e4df601153472b25a72e3a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_17.txt @@ -0,0 +1 @@ +With this in mind, let's go a bit deeper into the general library design. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_170.txt b/chunked/nltk_chunking/_add_new_model/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fc465134e51f37734cb22745a2fd55395df50e8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_170.txt @@ -0,0 +1,5 @@ +Go into the clone of your 🤗 Transformers' fork: + +cd transformers +In the special case that you are adding a model whose architecture exactly matches the model architecture of an +existing model you only have to add a conversion script as described in this section. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_171.txt b/chunked/nltk_chunking/_add_new_model/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..f74f296901ac384ac76d321723e05281cd4d17b6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_171.txt @@ -0,0 +1 @@ +In this case, you can just re-use the whole model architecture of the already existing model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_172.txt b/chunked/nltk_chunking/_add_new_model/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..4837ec3aba8dc67c40da7bfdc2c51a18f8081cf4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_172.txt @@ -0,0 +1 @@ +Otherwise, let's start generating a new model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_173.txt b/chunked/nltk_chunking/_add_new_model/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5e2b852e764ff3aef280032556dbe281d0ca8c8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_173.txt @@ -0,0 +1,6 @@ +You have two choices here: + +transformers-cli add-new-model-like to add a new model like an existing one +transformers-cli add-new-model to add a new model from our template (will look like BERT or Bart depending on the type of model you select) + +In both cases, you will be prompted with a questionnaire to fill in the basic information of your model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_174.txt b/chunked/nltk_chunking/_add_new_model/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..04a77d68ff56dfd0d5791e603385d60439c725f5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_174.txt @@ -0,0 +1 @@ +The second command requires you to install cookiecutter; you can find more information on it here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_175.txt b/chunked/nltk_chunking/_add_new_model/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..a146db85e7038d96bd2510988a3b1df99a46625d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_175.txt @@ -0,0 +1,3 @@ +Open a Pull Request on the main huggingface/transformers repo +Before starting to adapt the automatically generated code, now is the time to open a “Work in progress (WIP)” pull +request, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_176.txt b/chunked/nltk_chunking/_add_new_model/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..b372d2cb23773351495a28c6dc73c1be13c73b91 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_176.txt @@ -0,0 +1,2 @@ +“[WIP] Add brand_new_bert”, in 🤗 Transformers so that you and the Hugging Face team can work +side-by-side on integrating the model into 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_177.txt b/chunked/nltk_chunking/_add_new_model/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..02257e24b80559b17fc34a889ae2aa091fbbef6b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_177.txt @@ -0,0 +1,9 @@ +You should do the following: + +Create a branch with a descriptive name from your main branch + +git checkout -b add_brand_new_bert + +Commit the automatically generated code: + +git add . \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_178.txt b/chunked/nltk_chunking/_add_new_model/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..21b36c117ce33eea391c5042faf260c1456f3b85 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_178.txt @@ -0,0 +1,12 @@ +git commit + +Fetch and rebase to current main + +git fetch upstream +git rebase upstream/main + +Push the changes to your account using: + +git push -u origin a-descriptive-name-for-my-changes + +Once you are satisfied, go to the webpage of your fork on GitHub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_179.txt b/chunked/nltk_chunking/_add_new_model/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75556052af8e2c70205a6902f1b28b7aa5eb5f5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_179.txt @@ -0,0 +1 @@ +Click on “Pull request”. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_18.txt b/chunked/nltk_chunking/_add_new_model/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..21dd6e9fc0e2c1d550befb7f4d8a8bdc61c9c2a8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_18.txt @@ -0,0 +1,3 @@ +Overview of models +To successfully add a model, it is important to understand the interaction between your model and its config, +[PreTrainedModel], and [PretrainedConfig].
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_180.txt b/chunked/nltk_chunking/_add_new_model/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0b1892d3343828da4f6090d9709dfe62779eb1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_180.txt @@ -0,0 +1,3 @@ +Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_181.txt b/chunked/nltk_chunking/_add_new_model/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..db3be93eee9eb5990852f4089e0fd533d85a1ef4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_181.txt @@ -0,0 +1 @@ +Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_182.txt b/chunked/nltk_chunking/_add_new_model/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..9abfb8a4904a437caf00b15fea1dd5663562bd54 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_182.txt @@ -0,0 +1,2 @@ +In the following, whenever you have made some progress, don't forget to commit your work and push it to your account so +that it shows in the pull request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_183.txt b/chunked/nltk_chunking/_add_new_model/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f27fc9e4e09680bd654a9f5ca4cbd625596f7bc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_183.txt @@ -0,0 +1,7 @@ +Additionally, you should make sure to update your work with the current main from +time to time by doing: + +git fetch upstream +git merge upstream/main +In general, all questions you might have regarding the model or your implementation should be asked in your PR and +discussed/solved in the PR. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_184.txt b/chunked/nltk_chunking/_add_new_model/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..44eeceea4beec90f956f0fd3641d81160c1776ba --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_184.txt @@ -0,0 +1,2 @@ +This way, the Hugging Face team will always be notified when you are committing new code or +if you have a question. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_185.txt b/chunked/nltk_chunking/_add_new_model/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..950fa71e9b699d0c474738d2dc7e7f84749dcedd --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_185.txt @@ -0,0 +1,2 @@ +It is often very helpful to point the Hugging Face team to your added code so that the Hugging +Face team can efficiently understand your problem or question.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_186.txt b/chunked/nltk_chunking/_add_new_model/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..8664d336dfccbda80775842d4b5fc0df4eab71d8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_186.txt @@ -0,0 +1,2 @@ +To do so, you can go to the “Files changed” tab where you see all of your changes, go to a line regarding which you +want to ask a question, and click on the “+” symbol to add a comment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_187.txt b/chunked/nltk_chunking/_add_new_model/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..47e37150749b1c31009d9d2c8348aede9584aa77 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_187.txt @@ -0,0 +1,2 @@ +Whenever a question or problem has been solved, +you can click on the “Resolve” button of the created comment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_188.txt b/chunked/nltk_chunking/_add_new_model/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..281d6eade351291891424f2daf51e99f0bf92194 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_188.txt @@ -0,0 +1 @@ +In the same way, the Hugging Face team will open comments when reviewing your code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_189.txt b/chunked/nltk_chunking/_add_new_model/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..b130efda0cc87afa64f8a4f50daf230eaeb6819e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_189.txt @@ -0,0 +1,2 @@ +We recommend asking most questions +on GitHub on your PR. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_19.txt b/chunked/nltk_chunking/_add_new_model/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..934a43f7fccce0434311867defb54c0f92434df2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_19.txt @@ -0,0 +1,2 @@ +For exemplary purposes, we will +call the model to be added to 🤗 Transformers BrandNewBert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_190.txt b/chunked/nltk_chunking/_add_new_model/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11eef8e9bcf08caa9c5dc9374dd39d8e7b0ed6f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_190.txt @@ -0,0 +1,2 @@ +For some very general questions that are not very useful for the public, feel free to ping the +Hugging Face team by Slack or email.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_193.txt b/chunked/nltk_chunking/_add_new_model/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..728e564f4b17598ff3116f059ddb81c8dcedd4c4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_193.txt @@ -0,0 +1,3 @@ +All the relevant code should be +found in the generated files src/transformers/models/brand_new_bert/modeling_brand_new_bert.py and +src/transformers/models/brand_new_bert/configuration_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_194.txt b/chunked/nltk_chunking/_add_new_model/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..30fbb01f0b4cbad29eb2f4bfd1bbf0514eb1ac15 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_194.txt @@ -0,0 +1 @@ +Now you can finally start coding :). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_195.txt b/chunked/nltk_chunking/_add_new_model/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..4667b224597124cc51b5d50477bcc0ee04100f22 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_195.txt @@ -0,0 +1,3 @@ +The generated code in +src/transformers/models/brand_new_bert/modeling_brand_new_bert.py will either have the same architecture as BERT if +it's an encoder-only model or BART if it's an encoder-decoder model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_196.txt b/chunked/nltk_chunking/_add_new_model/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2d7256d924a69b860ca7c5dc780624b85f9d4d3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_196.txt @@ -0,0 +1,3 @@ +At this point, you should remind yourself what +you've learned in the beginning about the theoretical aspects of the model: How is the model different from BERT or +BART?". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_197.txt b/chunked/nltk_chunking/_add_new_model/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..0778f79dad9d94f3d3e466109f6d8a5825427144 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_197.txt @@ -0,0 +1,3 @@ +Implement those changes which often means changing the self-attention layer, the order of the normalization +layer, etc… Again, it is often useful to look at the similar architecture of already existing models in Transformers to +get a better feeling of how your model should be implemented. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_198.txt b/chunked/nltk_chunking/_add_new_model/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..617dc6ef7b385b3d644d80c28af289cb31218bf6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_198.txt @@ -0,0 +1 @@ +Note that at this point, you don't have to be very sure that your code is fully correct or clean. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_199.txt b/chunked/nltk_chunking/_add_new_model/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cf72fd0f3604e71f8f851b9cefa4797688f86e1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_199.txt @@ -0,0 +1,4 @@ +Rather, it is +advised to add a first unclean, copy-pasted version of the original code to +src/transformers/models/brand_new_bert/modeling_brand_new_bert.py until you feel like all the necessary code is +added. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_2.txt b/chunked/nltk_chunking/_add_new_model/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..d17e9e7bf3df12fa4b1ed97a5bdf7d0001757345 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_2.txt @@ -0,0 +1 @@ +But this can be a challenging project and requires an in-depth knowledge of the 🤗 Transformers library and the model to implement. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_20.txt b/chunked/nltk_chunking/_add_new_model/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffceb1d6a4c1fcd6ecc7e4c7ed0a2540b58ea2fc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_20.txt @@ -0,0 +1,4 @@ +Let's take a look: + +As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute +minimum. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_200.txt b/chunked/nltk_chunking/_add_new_model/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b308849d03dd1265283af2b259ec9f78bef9d73 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_200.txt @@ -0,0 +1,2 @@ +From our experience, it is much more efficient to quickly add a first version of the required code and +improve/correct the code iteratively with the conversion script as described in the next section. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_201.txt b/chunked/nltk_chunking/_add_new_model/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9a28f708d30cd6d3eefb688b38c21a1207018a1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_201.txt @@ -0,0 +1,2 @@ +The only thing that +has to work at this point is that you can instantiate the 🤗 Transformers implementation of brand_new_bert, i.e. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_202.txt b/chunked/nltk_chunking/_add_new_model/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..2da1b1f213b3326df566fc4b1e6e7f775c1e6d53 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_202.txt @@ -0,0 +1,8 @@ +the +following command should work: +python +from transformers import BrandNewBertModel, BrandNewBertConfig +model = BrandNewBertModel(BrandNewBertConfig()) + +The above command will create a model according to the default parameters as defined in BrandNewBertConfig() with +random weights, thus making sure that the __init__() methods of all components work.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_203.txt b/chunked/nltk_chunking/_add_new_model/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..68a96ce1ece4eec915575bcda96321a7e4283f80 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_203.txt @@ -0,0 +1,2 @@ +Note that all random initialization should happen in the _init_weights method of your BrandnewBertPreTrainedModel +class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_204.txt b/chunked/nltk_chunking/_add_new_model/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70cb3780ea5cd1dd32e6382d01eac3982aeccb5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_204.txt @@ -0,0 +1 @@ +It should initialize all leaf modules depending on the variables of the config. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_205.txt b/chunked/nltk_chunking/_add_new_model/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..105386cfe99da3b416269e4c41c34e53d1d45d70 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_205.txt @@ -0,0 +1,17 @@ +Here is an example with the +BERT _init_weights method: +py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) +You can have some more custom schemes if you need a special initialization for some modules. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_206.txt b/chunked/nltk_chunking/_add_new_model/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7dc83ef1106badd6452f0ea0eea91b8e1b4d05d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_206.txt @@ -0,0 +1,3 @@ +For instance, in +Wav2Vec2ForPreTraining, the last two linear layers need to have the initialization of the regular PyTorch nn.Linear +but all the other ones should use an initialization as above. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_207.txt b/chunked/nltk_chunking/_add_new_model/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..90d95220ca5265b1cdab721d81f83952a39fcc72 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_207.txt @@ -0,0 +1,14 @@ +This is coded like this: +py +def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, Wav2Vec2ForPreTraining): + module.project_hid.reset_parameters() + module.project_q.reset_parameters() + module.project_hid._is_hf_initialized = True + module.project_q._is_hf_initialized = True + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() +The _is_hf_initialized flag is internally used to make sure we only initialize a submodule once. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_208.txt b/chunked/nltk_chunking/_add_new_model/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb53869a85b6488d6874976a894da456ee646d74 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_208.txt @@ -0,0 +1,3 @@ +By setting it to +True for module.project_q and module.project_hid, we make sure the custom initialization we did is not overridden later on, +the _init_weights function won't be applied to them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_209.txt b/chunked/nltk_chunking/_add_new_model/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..66aa82048927681af984d35e45d694eaf46a34f9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_209.txt @@ -0,0 +1 @@ +6. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_21.txt b/chunked/nltk_chunking/_add_new_model/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..16cbb866468fb821798dd633b3c296ba139144cc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_21.txt @@ -0,0 +1 @@ +There are never more than two levels of abstraction for any model in the library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_210.txt b/chunked/nltk_chunking/_add_new_model/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..66184a1f2b149a440faedfacc9ab4e68ba46b1c8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_210.txt @@ -0,0 +1,4 @@ +Write a conversion script +Next, you should write a conversion script that lets you convert the checkpoint you used to debug brand_new_bert in +the original repository to a checkpoint compatible with your just created 🤗 Transformers implementation of +brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_211.txt b/chunked/nltk_chunking/_add_new_model/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9c6a46c536f913af70da90c6f274d1b0ffadeb3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_211.txt @@ -0,0 +1,3 @@ +It is not advised to write the conversion script from scratch, but rather to look through already +existing conversion scripts in 🤗 Transformers for one that has been used to convert a similar model that was written in +the same framework as brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_212.txt b/chunked/nltk_chunking/_add_new_model/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..9472cd92950ede7f9cdb09f03f61c4481a5c3224 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_212.txt @@ -0,0 +1,2 @@ +Usually, it is enough to copy an already existing conversion script and +slightly adapt it for your use case. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_213.txt b/chunked/nltk_chunking/_add_new_model/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcff5a6e408ccbf64cba9a98e910107c483948d6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_213.txt @@ -0,0 +1,2 @@ +Don't hesitate to ask the Hugging Face team to point you to a similar already +existing conversion script for your model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_214.txt b/chunked/nltk_chunking/_add_new_model/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3f00cb2fa43191427b0c935c9c99b85e26d5289 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_214.txt @@ -0,0 +1,4 @@ +If you are porting a model from TensorFlow to PyTorch, a good starting point might be BERT's conversion script here +If you are porting a model from PyTorch to PyTorch, a good starting point might be BART's conversion script here + +In the following, we'll quickly explain how PyTorch models store layer weights and define layer names. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_215.txt b/chunked/nltk_chunking/_add_new_model/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6e680681af74cecac09590c9a37a88d4191ce4b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_215.txt @@ -0,0 +1,2 @@ +In PyTorch, the +name of a layer is defined by the name of the class attribute you give the layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_216.txt b/chunked/nltk_chunking/_add_new_model/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..32f30eba53fad9c9f72e307f6c218f65f1778457 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_216.txt @@ -0,0 +1,13 @@ +Let's define a dummy model in +PyTorch, called SimpleModel as follows: +python +from torch import nn +class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.dense = nn.Linear(10, 10) + self.intermediate = nn.Linear(10, 10) + self.layer_norm = nn.LayerNorm(10) + +Now we can create an instance of this model definition which will fill all weights: dense, intermediate, +layer_norm with random weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_217.txt b/chunked/nltk_chunking/_add_new_model/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..00e8e5ad7d2169d975fc0b7edcb6fb1847827101 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_217.txt @@ -0,0 +1,12 @@ +We can print the model to see its architecture +python +model = SimpleModel() +print(model) + +This will print out the following: +SimpleModel( + (dense): Linear(in_features=10, out_features=10, bias=True) + (intermediate): Linear(in_features=10, out_features=10, bias=True) + (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True) +) +We can see that the layer names are defined by the name of the class attribute in PyTorch.
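To complement the SimpleModel example above, here is a small sketch that lists every parameter name and shape at once via state_dict(); this kind of listing is often handy later when drafting the name mapping for the conversion script:

```python
# Sketch only: list all parameter names and shapes of the dummy SimpleModel defined above.
from torch import nn

class SimpleModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(10, 10)
        self.intermediate = nn.Linear(10, 10)
        self.layer_norm = nn.LayerNorm(10)

model = SimpleModel()
for name, tensor in model.state_dict().items():
    # prints e.g. "dense.weight (10, 10)" - the names mirror the class attributes
    print(name, tuple(tensor.shape))
```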
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_218.txt b/chunked/nltk_chunking/_add_new_model/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..94071fef4985299b4647dbf0859ed3a3d638d398 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_218.txt @@ -0,0 +1,25 @@ +You can print out the weight +values of a specific layer: +python +print(model.dense.weight.data) +to see that the weights were randomly initialized +tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212, + -0.2077, 0.2157], + [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190, + 0.2166, -0.0212], + [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950, + -0.1023, -0.0447], + [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415, + -0.1876, -0.2467], + [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465, + 0.2577, 0.0402], + [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604, + 0.2132, 0.1680], + [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090, + 0.2707, -0.2509], + [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407, + 0.1829, -0.1568], + [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923, + 0.0333, -0.0536], + [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739, + 0.2220, 0.2358]]). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_219.txt b/chunked/nltk_chunking/_add_new_model/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc28efe5bffc01cd87b14b1b1c6e1dfd6574a981 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_219.txt @@ -0,0 +1,2 @@ +In the conversion script, you should fill those randomly initialized weights with the exact weights of the +corresponding layer in the checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_22.txt b/chunked/nltk_chunking/_add_new_model/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..44e79da6d842ed8b381100c699ffd2b2f311a8f7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_22.txt @@ -0,0 +1,3 @@ +BrandNewBertModel +inherits from BrandNewBertPreTrainedModel which in turn inherits from [PreTrainedModel] and +that's it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_220.txt b/chunked/nltk_chunking/_add_new_model/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_220.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_221.txt b/chunked/nltk_chunking/_add_new_model/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..07093513e6cd51b9ccc61554ba263ca33797e8dc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_221.txt @@ -0,0 +1,2 @@ +thon +retrieve matching layer weights, e.g. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_222.txt b/chunked/nltk_chunking/_add_new_model/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..63211ccfda5787aed13bfdaf7c96807fc943d8a1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_222.txt @@ -0,0 +1,9 @@ +by +recursive algorithm +layer_name = "dense" +pretrained_weight = array_of_dense_layer +model_pointer = getattr(model, "dense") +model_pointer.weight.data = torch.from_numpy(pretrained_weight) + +While doing so, you must verify that each randomly initialized weight of your PyTorch model and its corresponding +pretrained checkpoint weight exactly match in both shape and name. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_223.txt b/chunked/nltk_chunking/_add_new_model/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89542736b3ecc485ddee8779e04dbdec5f9545c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_223.txt @@ -0,0 +1,2 @@ +To do so, it is necessary to add assert +statements for the shape and print out the names of the checkpoints weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_224.txt b/chunked/nltk_chunking/_add_new_model/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_224.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_225.txt b/chunked/nltk_chunking/_add_new_model/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..55c0df20150fc04c24743c36de8eac76cc884e32 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_225.txt @@ -0,0 +1,6 @@ +you should add statements like: +python +assert ( + model_pointer.weight.shape == pretrained_weight.shape +), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched" +Besides, you should also print out the names of both weights to make sure they match, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_226.txt b/chunked/nltk_chunking/_add_new_model/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0525810cff9a46064db54533ffffb344234c595 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_226.txt @@ -0,0 +1,4 @@ +python +logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}") +If either the shape or the name doesn't match, you probably assigned the wrong checkpoint weight to a randomly +initialized layer of the 🤗 Transformers implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_227.txt b/chunked/nltk_chunking/_add_new_model/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..239fd53c4195dc7925e785224a054f5e0927abcf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_227.txt @@ -0,0 +1,2 @@ +An incorrect shape is most likely due to an incorrect setting of the config parameters in BrandNewBertConfig() that +do not exactly match those that were used for the checkpoint you want to convert. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_228.txt b/chunked/nltk_chunking/_add_new_model/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..a25fa1acd6ff3dfb795bde42d25a7e0dcf8fbe38 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_228.txt @@ -0,0 +1,2 @@ +However, it could also be that +PyTorch's implementation of a layer requires the weight to be transposed beforehand. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_229.txt b/chunked/nltk_chunking/_add_new_model/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b923fa7e444c57a02ad5aba2ff3746b4250e98d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_229.txt @@ -0,0 +1,2 @@ +Finally, you should also check that all required weights are initialized and print out all checkpoint weights that +were not used for initialization to make sure the model is correctly converted. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_23.txt b/chunked/nltk_chunking/_add_new_model/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..edb5639e25a068d545b5dcf5cc0b952db99efbce --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_23.txt @@ -0,0 +1,2 @@ +As a general rule, we want to make sure that a new model only depends on +[PreTrainedModel]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_230.txt b/chunked/nltk_chunking/_add_new_model/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..07934c396742e731ba5ce4f0e4b96998da8ea414 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_230.txt @@ -0,0 +1,2 @@ +It is completely normal, that the +conversion trials fail with either a wrong shape statement or a wrong name assignment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_231.txt b/chunked/nltk_chunking/_add_new_model/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..08dc7f4d3c8aa0e47854abddf3970110616a9edf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_231.txt @@ -0,0 +1,4 @@ +This is most likely because either +you used incorrect parameters in BrandNewBertConfig(), have a wrong architecture in the 🤗 Transformers +implementation, you have a bug in the init() functions of one of the components of the 🤗 Transformers +implementation or you need to transpose one of the checkpoint weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_232.txt b/chunked/nltk_chunking/_add_new_model/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..b01c473ec4cb774538a3b9567d4affa20d7427af --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_232.txt @@ -0,0 +1,2 @@ +This step should be iterated with the previous step until all weights of the checkpoint are correctly loaded in the +Transformers model. 
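As a rough illustration of the conversion loop described above, the sketch below loads a plain PyTorch state dict and copies it into the 🤗 Transformers model; the checkpoint path and the NAME_MAP entries are hypothetical placeholders, not the real brand_new_bert mapping:

```python
# Hypothetical sketch of a minimal conversion loop; paths and NAME_MAP are illustrative only.
import torch
from transformers import BrandNewBertConfig, BrandNewBertModel

# Illustrative mapping from original checkpoint names to 🤗 Transformers parameter names
NAME_MAP = {
    "encoder.layer_0.attn.q_proj.weight": "encoder.layers.0.self_attn.q_proj.weight",
    # ... one entry (or a renaming rule) per weight ...
}

original_state_dict = torch.load("/path/to/original/checkpoint.pt", map_location="cpu")
model = BrandNewBertModel(BrandNewBertConfig())
hf_state_dict = model.state_dict()

unused_weights = []
for orig_name, weight in original_state_dict.items():
    hf_name = NAME_MAP.get(orig_name)
    if hf_name is None:
        unused_weights.append(orig_name)  # checkpoint weights not used for initialization
        continue
    assert (
        hf_state_dict[hf_name].shape == weight.shape
    ), f"Shape mismatch for {hf_name}: {hf_state_dict[hf_name].shape} vs {weight.shape}"
    print(f"Initialize PyTorch weight {hf_name} from {orig_name}")
    hf_state_dict[hf_name] = weight

model.load_state_dict(hf_state_dict)
print("Checkpoint weights that were not used:", unused_weights)
```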
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_233.txt b/chunked/nltk_chunking/_add_new_model/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c5084595221d56a964d52bee3671e5d78692ad1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_233.txt @@ -0,0 +1,6 @@ +Having correctly loaded the checkpoint into the 🤗 Transformers implementation, you can then save +the model under a folder of your choice /path/to/converted/checkpoint/folder that should then contain both a +pytorch_model.bin file and a config.json file: +python +model.save_pretrained("/path/to/converted/checkpoint/folder") +7. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_234.txt b/chunked/nltk_chunking/_add_new_model/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa3f326725f776a9312de4d3bb89dea88fb53a3d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_234.txt @@ -0,0 +1,3 @@ +Implement the forward pass +Having managed to correctly load the pretrained weights into the 🤗 Transformers implementation, you should now make +sure that the forward pass is correctly implemented. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_235.txt b/chunked/nltk_chunking/_add_new_model/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..30e23c47db71ffa3cbe1f699221cc3771c6bd86a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_235.txt @@ -0,0 +1,2 @@ +In Get familiar with the original repository, you have already created a script that runs a forward +pass of the model using the original repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_236.txt b/chunked/nltk_chunking/_add_new_model/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddcfd76da9ae3b63cc3e2363cee441daf70bdec8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_236.txt @@ -0,0 +1,2 @@ +Now you should write an analogous script using the 🤗 Transformers +implementation instead of the original one. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_237.txt b/chunked/nltk_chunking/_add_new_model/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..626918bd12377040030d8f280f8a63d567afdf64 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_237.txt @@ -0,0 +1,7 @@ +It should look as follows: +python +model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder") +input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19] +output = model(input_ids).last_hidden_states +It is very likely that the 🤗 Transformers implementation and the original model implementation don't give the exact +same output the very first time or that the forward pass throws an error. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_238.txt b/chunked/nltk_chunking/_add_new_model/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a5b19e4446f03d78d1d24f6c0a8d7ce0cbabef6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_238.txt @@ -0,0 +1 @@ +Don't be disappointed - it's expected! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_239.txt b/chunked/nltk_chunking/_add_new_model/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..abf8cf50903c53c5677e859943d5aaef63a24356 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_239.txt @@ -0,0 +1,2 @@ +First, +you should make sure that the forward pass doesn't throw any errors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_24.txt b/chunked/nltk_chunking/_add_new_model/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..05a68c44a2e4174c92160ae74b9e1110b0a5267b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_24.txt @@ -0,0 +1,3 @@ +The important functionalities that are automatically provided to every new +model are [~PreTrainedModel.from_pretrained] and +[~PreTrainedModel.save_pretrained], which are used for serialization and deserialization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_240.txt b/chunked/nltk_chunking/_add_new_model/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fa295c30bb417769419298cb35f47ddc0cdca2a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_240.txt @@ -0,0 +1,2 @@ +It often happens that the wrong dimensions are +used leading to a Dimensionality mismatch error or that the wrong data type object is used, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_241.txt b/chunked/nltk_chunking/_add_new_model/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..909a3ad17a6696f800f788d1f4bf60790d864c8d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_241.txt @@ -0,0 +1,2 @@ +torch.long +instead of torch.float32. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_242.txt b/chunked/nltk_chunking/_add_new_model/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca61e0a7c738258c9a08c63dfdc41cd6032974c6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_242.txt @@ -0,0 +1,2 @@ +Don't hesitate to ask the Hugging Face team for help, if you don't manage to solve +certain errors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_243.txt b/chunked/nltk_chunking/_add_new_model/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..01faca3dd0f8ae00d92dab7c222332d85a54cea4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_243.txt @@ -0,0 +1,2 @@ +The final part to make sure the 🤗 Transformers implementation works correctly is to ensure that the outputs are +equivalent to a precision of 1e-3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_244.txt b/chunked/nltk_chunking/_add_new_model/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6effcba5353b6ec2d32d9fbf93cf2347c7ee348 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_244.txt @@ -0,0 +1 @@ +First, you should ensure that the output shapes are identical, i.e. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_245.txt b/chunked/nltk_chunking/_add_new_model/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aa05911a4fd18adf174881ea09eb414bbb47db1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_245.txt @@ -0,0 +1,2 @@ +outputs.shape should yield the same value for the script of the 🤗 Transformers implementation and the original +implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_246.txt b/chunked/nltk_chunking/_add_new_model/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd7297d1308a98e0deb369bea95da28cef8105b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_246.txt @@ -0,0 +1 @@ +Next, you should make sure that the output values are identical as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_247.txt b/chunked/nltk_chunking/_add_new_model/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d3fc40c38cc699537526425d8ece302de2bdc2f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_247.txt @@ -0,0 +1,2 @@ +This is one of the most difficult +parts of adding a new model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_248.txt b/chunked/nltk_chunking/_add_new_model/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..abb27973d4ec6bab62185d830f1c469080719947 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_248.txt @@ -0,0 +1,3 @@ +Common mistakes that cause the outputs to differ are: + +Some layers were not added, i.e. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_249.txt b/chunked/nltk_chunking/_add_new_model/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..29846151180ddf99eb15759c9a74c3054dc75eeb --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_249.txt @@ -0,0 +1,4 @@ +an activation layer was not added, or the residual connection was forgotten +The word embedding matrix was not tied +The wrong positional embeddings are used because the original implementation uses an offset +Dropout is applied during the forward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_25.txt b/chunked/nltk_chunking/_add_new_model/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfe7463acff0d8da2feb83f139cb3a7829b3b838 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_25.txt @@ -0,0 +1,3 @@ +All of the +other important functionalities, such as BrandNewBertModel.forward should be completely defined in the new +modeling_brand_new_bert.py script. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_250.txt b/chunked/nltk_chunking/_add_new_model/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3484284d0e30ca377dd87fa3b3baf6e8d344af5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_250.txt @@ -0,0 +1,2 @@ +To fix this, make sure model.training is False and that no dropout + layer is falsely activated during the forward pass, i.e.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_251.txt b/chunked/nltk_chunking/_add_new_model/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..01456c7daad170ab8b72568d132f471484aba0a7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_251.txt @@ -0,0 +1,4 @@ +pass self.training to PyTorch's functional dropout + +The best way to fix the problem is usually to look at the forward pass of the original implementation and the 🤗 +Transformers implementation side-by-side and check if there are any differences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_252.txt b/chunked/nltk_chunking/_add_new_model/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c7cb793391c36426086177da50c7bad82902740 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_252.txt @@ -0,0 +1,3 @@ +Ideally, you should debug/print out +intermediate outputs of both implementations of the forward pass to find the exact position in the network where the 🤗 +Transformers implementation shows a different output than the original implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_253.txt b/chunked/nltk_chunking/_add_new_model/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..240bffb8d12a174eab6952c18dd5126e21328a28 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_253.txt @@ -0,0 +1,2 @@ +First, make sure that the +hard-coded input_ids in both scripts are identical. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_254.txt b/chunked/nltk_chunking/_add_new_model/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..efbaf82259a73baeba7d288eb2ca5f91559c7e2f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_254.txt @@ -0,0 +1,2 @@ +Next, verify that the outputs of the first transformation of +the input_ids (usually the word embeddings) are identical. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_255.txt b/chunked/nltk_chunking/_add_new_model/chunk_255.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b398fd24dcfd95a3381d652992afaafd215fbe2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_255.txt @@ -0,0 +1,2 @@ +And then work your way up to the very last layer of the +network. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_256.txt b/chunked/nltk_chunking/_add_new_model/chunk_256.txt new file mode 100644 index 0000000000000000000000000000000000000000..19018ea36b94c0a8501964721285644e04282dae --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_256.txt @@ -0,0 +1,2 @@ +At some point, you will notice a difference between the two implementations, which should point you to the bug +in the 🤗 Transformers implementation. 
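One possible way to do this kind of layer-by-layer comparison is sketched below: register forward hooks on a few submodules of the 🤗 Transformers model and print their outputs, so they can be checked against print statements placed at the same positions in the original repository. The module names used here are made up for illustration:

```python
# Sketch: print intermediate outputs of the 🤗 Transformers implementation via forward hooks.
import torch
from transformers import BrandNewBertModel

model = BrandNewBertModel.from_pretrained("/path/to/converted/checkpoint/folder")
model.eval()

def print_output_hook(name):
    def hook(module, inputs, output):
        tensor = output[0] if isinstance(output, tuple) else output
        print(f"{name}: shape={tuple(tensor.shape)}, first values={tensor.flatten()[:3].tolist()}")
    return hook

# Hypothetical module names; inspect model.named_modules() to pick the ones you care about
for name, module in model.named_modules():
    if name in {"embeddings", "encoder.layers.0", "encoder.layers.11"}:
        module.register_forward_hook(print_output_hook(name))

input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
with torch.no_grad():
    model(input_ids)
```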
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_257.txt b/chunked/nltk_chunking/_add_new_model/chunk_257.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd06bea2758467d734e97aba537ee3e5079458b4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_257.txt @@ -0,0 +1,3 @@ +From our experience, a simple and efficient way is to add many print statements +in both the original implementation and 🤗 Transformers implementation, at the same positions in the network +respectively, and to successively remove print statements showing the same values for intermediate representations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_258.txt b/chunked/nltk_chunking/_add_new_model/chunk_258.txt new file mode 100644 index 0000000000000000000000000000000000000000..01b5b4641aba2947f674d56c91f2b70a1153ca43 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_258.txt @@ -0,0 +1,2 @@ +When you're confident that both implementations yield the same output (verify the outputs with +torch.allclose(original_output, output, atol=1e-3)), you're done with the most difficult part! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_259.txt b/chunked/nltk_chunking/_add_new_model/chunk_259.txt new file mode 100644 index 0000000000000000000000000000000000000000..525319d6662df773bc7d83e1c76fa3cb2c72c0bf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_259.txt @@ -0,0 +1,2 @@ +Congratulations - the +work left to be done should be a cakewalk 😊. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_26.txt b/chunked/nltk_chunking/_add_new_model/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..17c8160d46b3707f5c16287a346031c0bc6adb02 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_26.txt @@ -0,0 +1,3 @@ +Next, we want to make sure that a model with a specific head layer, such as +BrandNewBertForMaskedLM does not inherit from BrandNewBertModel, but rather uses BrandNewBertModel +as a component that can be called in its forward pass to keep the level of abstraction low. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_260.txt b/chunked/nltk_chunking/_add_new_model/chunk_260.txt new file mode 100644 index 0000000000000000000000000000000000000000..e470f6393cd8e515f0e678f5d32a87898f8ca97e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_260.txt @@ -0,0 +1 @@ +8. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_261.txt b/chunked/nltk_chunking/_add_new_model/chunk_261.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca0f2002b467b5c73629e70c7458d33dbe6fc65c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_261.txt @@ -0,0 +1,2 @@ +Adding all necessary model tests +At this point, you have successfully added a new model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_262.txt b/chunked/nltk_chunking/_add_new_model/chunk_262.txt new file mode 100644 index 0000000000000000000000000000000000000000..c61ab0c88f1db780d397ef1833d9ae31d5468827 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_262.txt @@ -0,0 +1,2 @@ +However, it is very much possible that the model does not yet +fully comply with the required design.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_263.txt b/chunked/nltk_chunking/_add_new_model/chunk_263.txt new file mode 100644 index 0000000000000000000000000000000000000000..becb1fb07e23b05d36963bd01b6680686677c37f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_263.txt @@ -0,0 +1,2 @@ +To make sure the implementation is fully compatible with 🤗 Transformers, all +common tests should pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_264.txt b/chunked/nltk_chunking/_add_new_model/chunk_264.txt new file mode 100644 index 0000000000000000000000000000000000000000..11583083a6d9f1a14a7133a06481b9c5360a35b1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_264.txt @@ -0,0 +1,2 @@ +The Cookiecutter should have automatically added a test file for your model, probably under +tests/models/brand_new_bert/test_modeling_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_265.txt b/chunked/nltk_chunking/_add_new_model/chunk_265.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ff7d4b10d2eb0512e3ea3f607d2dd52a4138bc8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_265.txt @@ -0,0 +1,8 @@ +Run this test file to verify that all common +tests pass: + +pytest tests/models/brand_new_bert/test_modeling_brand_new_bert.py +Having fixed all common tests, it is now crucial to ensure that all the nice work you have done is well tested, so that + +a) The community can easily understand your work by looking at specific tests of brand_new_bert +b) Future changes to your model will not break any important feature of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_266.txt b/chunked/nltk_chunking/_add_new_model/chunk_266.txt new file mode 100644 index 0000000000000000000000000000000000000000..99a3c962bf6d1a694632c80be6dcccf277f718b7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_266.txt @@ -0,0 +1 @@ +At first, integration tests should be added. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_267.txt b/chunked/nltk_chunking/_add_new_model/chunk_267.txt new file mode 100644 index 0000000000000000000000000000000000000000..f650680cf8dbc7f8699625cdc2f111ecc7de1e1f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_267.txt @@ -0,0 +1,2 @@ +Those integration tests essentially do the same as the debugging scripts +you used earlier to implement the model in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_268.txt b/chunked/nltk_chunking/_add_new_model/chunk_268.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f2ac11652f49e188548610601957b507609bc93 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_268.txt @@ -0,0 +1,2 @@ +A template of those model tests has already been added by the +Cookiecutter, called BrandNewBertModelIntegrationTests and only has to be filled out by you.
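A hypothetical skeleton of such an integration test could look like the following; the checkpoint id and the expected values are placeholders that would be replaced with outputs recorded from the original implementation:

```python
# Hypothetical skeleton for BrandNewBertModelIntegrationTests; checkpoint id and values are placeholders.
import unittest
import torch
from transformers import BrandNewBertModel

class BrandNewBertModelIntegrationTests(unittest.TestCase):
    def test_inference_no_head(self):
        model = BrandNewBertModel.from_pretrained("brand-new-bert-base-cased")
        model.eval()
        input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
        with torch.no_grad():
            output = model(input_ids).last_hidden_state
        # first check the shape ...
        self.assertEqual(output.shape, torch.Size([1, 9, model.config.hidden_size]))
        # ... then compare a small slice against hard-coded values from the original implementation
        expected_slice = torch.tensor([[[0.0000, 0.0000, 0.0000]]])  # placeholder values
        self.assertTrue(torch.allclose(output[:, :1, :3], expected_slice, atol=1e-3))
```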
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_269.txt b/chunked/nltk_chunking/_add_new_model/chunk_269.txt new file mode 100644 index 0000000000000000000000000000000000000000..e654211eb62a1a8ebab5bf44f923b22e6f2d0091 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_269.txt @@ -0,0 +1,9 @@ +To ensure that those +tests are passing, run + +RUN_SLOW=1 pytest -sv tests/models/brand_new_bert/test_modeling_brand_new_bert.py::BrandNewBertModelIntegrationTests + +In case you are using Windows, you should replace RUN_SLOW=1 with SET RUN_SLOW=1 + +Second, all features that are special to brand_new_bert should be tested additionally in a separate test under +BrandNewBertModelTester/`BrandNewBertModelTest. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_27.txt b/chunked/nltk_chunking/_add_new_model/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..88b7da65e0616a7d692a207a9b74fedf2ebe4f4c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_27.txt @@ -0,0 +1,2 @@ +Every new model requires a +configuration class, called BrandNewBertConfig. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_270.txt b/chunked/nltk_chunking/_add_new_model/chunk_270.txt new file mode 100644 index 0000000000000000000000000000000000000000..44543c5d3fbdd372959ff16a449275b46155b0a1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_270.txt @@ -0,0 +1,5 @@ +This part is often forgotten but is extremely useful in two +ways: + +It helps to transfer the knowledge you have acquired during the model addition to the community by showing how the + special features of brand_new_bert should work. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_271.txt b/chunked/nltk_chunking/_add_new_model/chunk_271.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5bdd31a1db587385da14218cb6f0042ba0954e0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_271.txt @@ -0,0 +1 @@ +Future contributors can quickly test changes to the model by running those special tests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_272.txt b/chunked/nltk_chunking/_add_new_model/chunk_272.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aff809b3769023dead0b37953bdd25cdb5424f0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_272.txt @@ -0,0 +1 @@ +9. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_273.txt b/chunked/nltk_chunking/_add_new_model/chunk_273.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ad7893e75a41834cede37b5042e052f0ccd7489 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_273.txt @@ -0,0 +1,2 @@ +Implement the tokenizer +Next, we should add the tokenizer of brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_274.txt b/chunked/nltk_chunking/_add_new_model/chunk_274.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fea9ae7d9f496c098bc1a47138dead143b8bdcf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_274.txt @@ -0,0 +1,2 @@ +Usually, the tokenizer is equivalent to or very similar to an +already existing tokenizer of 🤗 Transformers. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_275.txt b/chunked/nltk_chunking/_add_new_model/chunk_275.txt new file mode 100644 index 0000000000000000000000000000000000000000..4059803e2788a3deda6ba4e18f4a685e0a846468 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_275.txt @@ -0,0 +1,2 @@ +It is very important to find/extract the original tokenizer file and to manage to load this file into the 🤗 +Transformers' implementation of the tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_276.txt b/chunked/nltk_chunking/_add_new_model/chunk_276.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f15774b425f6c06869d89ed32ae5657f9d8606d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_276.txt @@ -0,0 +1,2 @@ +To ensure that the tokenizer works correctly, it is recommended to first create a script in the original repository +that inputs a string and returns the `input_ids``. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_277.txt b/chunked/nltk_chunking/_add_new_model/chunk_277.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c2faf4a8cb30212f55523e2e513b7322b794af5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_277.txt @@ -0,0 +1,3 @@ +It could look similar to this (in pseudo-code): +python +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_278.txt b/chunked/nltk_chunking/_add_new_model/chunk_278.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d30b3c7be95c2072776f4a8b9228c44517de93 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_278.txt @@ -0,0 +1,4 @@ +model = BrandNewBertModel.load_pretrained_checkpoint("/path/to/checkpoint/") +input_ids = model.tokenize(input_str) +You might have to take a deeper look again into the original repository to find the correct tokenizer function or you +might even have to do changes to your clone of the original repository to only output the input_ids. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_279.txt b/chunked/nltk_chunking/_add_new_model/chunk_279.txt new file mode 100644 index 0000000000000000000000000000000000000000..34c17bb5fd0d16297ca79a6a8a0c366c808ae198 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_279.txt @@ -0,0 +1,3 @@ +Having written +a functional tokenization script that uses the original repository, an analogous script for 🤗 Transformers should be +created. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_28.txt b/chunked/nltk_chunking/_add_new_model/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..782e453a5f39834e74b1d489dc993c98c259ea63 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_28.txt @@ -0,0 +1,8 @@ +This configuration is always stored as an attribute in +[PreTrainedModel], and thus can be accessed via the config attribute for all classes +inheriting from BrandNewBertPreTrainedModel: +python +model = BrandNewBertModel.from_pretrained("brandy/brand_new_bert") +model.config # model has access to its config +Similar to the model, the configuration inherits basic serialization and deserialization functionalities from +[PretrainedConfig]. 
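For reference, the configuration serialization round trip mentioned above boils down to something like this sketch (the folder path is illustrative):

```python
# Sketch: configuration save/load round trip via the functionality inherited from PretrainedConfig.
from transformers import BrandNewBertConfig

config = BrandNewBertConfig()                              # default configuration
config.save_pretrained("/path/to/brand_new_bert")          # writes config.json
reloaded = BrandNewBertConfig.from_pretrained("/path/to/brand_new_bert")
assert config.to_dict() == reloaded.to_dict()
```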
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_280.txt b/chunked/nltk_chunking/_add_new_model/chunk_280.txt new file mode 100644 index 0000000000000000000000000000000000000000..96c7d8a1ff9685303b226bc1ceafb5967ebd3d23 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_280.txt @@ -0,0 +1,4 @@ +It should look similar to this: +thon +from transformers import BrandNewBertTokenizer +input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words." \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_281.txt b/chunked/nltk_chunking/_add_new_model/chunk_281.txt new file mode 100644 index 0000000000000000000000000000000000000000..79c53e2edf5abb274de8ce730a346ec3786009df --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_281.txt @@ -0,0 +1,4 @@ +tokenizer = BrandNewBertTokenizer.from_pretrained("/path/to/tokenizer/folder/") +input_ids = tokenizer(input_str).input_ids + +When both input_ids yield the same values, as a final step a tokenizer test file should also be added. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_282.txt b/chunked/nltk_chunking/_add_new_model/chunk_282.txt new file mode 100644 index 0000000000000000000000000000000000000000..b11c99fcc271961d1c6f3f04cc04f5b83a4bd2a8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_282.txt @@ -0,0 +1,2 @@ +Analogous to the modeling test files of brand_new_bert, the tokenization test files of brand_new_bert should +contain a couple of hard-coded integration tests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_283.txt b/chunked/nltk_chunking/_add_new_model/chunk_283.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec4cedfd09d63025d16d23d232c2c6413b1edfb0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_283.txt @@ -0,0 +1 @@ +10. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_284.txt b/chunked/nltk_chunking/_add_new_model/chunk_284.txt new file mode 100644 index 0000000000000000000000000000000000000000..f670dbee9b0127e83ded4eb2a59575fdad32446b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_284.txt @@ -0,0 +1,3 @@ +Run End-to-end integration tests +Having added the tokenizer, you should also add a couple of end-to-end integration tests using both the model and the +tokenizer to tests/models/brand_new_bert/test_modeling_brand_new_bert.py in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_285.txt b/chunked/nltk_chunking/_add_new_model/chunk_285.txt new file mode 100644 index 0000000000000000000000000000000000000000..576b7829932631d8740e1eab970ead3df881fce8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_285.txt @@ -0,0 +1,2 @@ +Such a test should show on a meaningful +text-to-text sample that the 🤗 Transformers implementation works as expected. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_286.txt b/chunked/nltk_chunking/_add_new_model/chunk_286.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c9d187405eb4eb15e12fed384f0cdb33e6b58c6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_286.txt @@ -0,0 +1,2 @@ +A meaningful text-to-text sample can +include e.g. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_287.txt b/chunked/nltk_chunking/_add_new_model/chunk_287.txt new file mode 100644 index 0000000000000000000000000000000000000000..85e7f48718c7b5f56b3efbfdea6ab761a1a4265f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_287.txt @@ -0,0 +1,2 @@ +a source-to-target-translation pair, an article-to-summary pair, a question-to-answer pair, etc… If none +of the ported checkpoints has been fine-tuned on a downstream task it is enough to simply rely on the model tests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_288.txt b/chunked/nltk_chunking/_add_new_model/chunk_288.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4d287141bffbd0c1d4a6fde419ea785bf9384cf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_288.txt @@ -0,0 +1,2 @@ +In a +final step to ensure that the model is fully functional, it is advised that you also run all tests on GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_289.txt b/chunked/nltk_chunking/_add_new_model/chunk_289.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0b4f3f9c70a931c3ea86c301cb2b17bd02beaa9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_289.txt @@ -0,0 +1,3 @@ +It can +happen that you forgot to add some .to(self.device) statements to internal tensors of the model, which in such a +test would show in an error. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_29.txt b/chunked/nltk_chunking/_add_new_model/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e830144269a38032ba01947a7db2b2f312289bf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_29.txt @@ -0,0 +1,2 @@ +Note that the configuration and the model are always serialized into two +different formats - the model to a pytorch_model.bin file and the configuration to a config.json file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_290.txt b/chunked/nltk_chunking/_add_new_model/chunk_290.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1120a876fa4559e9cdcdf350e134708324daf74 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_290.txt @@ -0,0 +1,2 @@ +In case you have no access to a GPU, the Hugging Face team can take care of running those +tests for you. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_291.txt b/chunked/nltk_chunking/_add_new_model/chunk_291.txt new file mode 100644 index 0000000000000000000000000000000000000000..d770ba86d9b4b4ab940ea8a80d504c91cae703d5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_291.txt @@ -0,0 +1 @@ +11. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_292.txt b/chunked/nltk_chunking/_add_new_model/chunk_292.txt new file mode 100644 index 0000000000000000000000000000000000000000..b77ae84df062e6542b78e150cee19af1242a4c5b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_292.txt @@ -0,0 +1,2 @@ +Add Docstring +Now, all the necessary functionality for brand_new_bert is added - you're almost done! 
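Before moving on to the docstring, here is a hypothetical sketch of the kind of end-to-end model-plus-tokenizer test described above; the head class, checkpoint id, input text and expected summary are all placeholders:

```python
# Hypothetical end-to-end integration test combining tokenizer and model; all names and values are placeholders.
import unittest
import torch
from transformers import BrandNewBertForConditionalGeneration, BrandNewBertTokenizer

class BrandNewBertEndToEndIntegrationTests(unittest.TestCase):
    def test_summarization(self):
        tokenizer = BrandNewBertTokenizer.from_pretrained("brand-new-bert-base-cased")
        model = BrandNewBertForConditionalGeneration.from_pretrained("brand-new-bert-base-cased")
        model.eval()

        article = "A long input article that the fine-tuned checkpoint is expected to summarize ..."
        input_ids = tokenizer(article, return_tensors="pt").input_ids
        with torch.no_grad():
            generated = model.generate(input_ids, max_new_tokens=32)
        summary = tokenizer.decode(generated[0], skip_special_tokens=True)

        # hard-coded expected output recorded from the original implementation
        self.assertEqual(summary, "placeholder expected summary from the original model")
```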
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_293.txt b/chunked/nltk_chunking/_add_new_model/chunk_293.txt new file mode 100644 index 0000000000000000000000000000000000000000..0007cda45558597c16ecffb2bccee9146d667d48 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_293.txt @@ -0,0 +1,2 @@ +The only thing left to add is +a nice docstring and a doc page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_294.txt b/chunked/nltk_chunking/_add_new_model/chunk_294.txt new file mode 100644 index 0000000000000000000000000000000000000000..550837612cce041bfbbb93466c3652dd58872aa1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_294.txt @@ -0,0 +1,2 @@ +The Cookiecutter should have added a template file called +docs/source/model_doc/brand_new_bert.md that you should fill out. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_295.txt b/chunked/nltk_chunking/_add_new_model/chunk_295.txt new file mode 100644 index 0000000000000000000000000000000000000000..40ca9582374de3c1ac953ff024a15b69d8b44e51 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_295.txt @@ -0,0 +1,2 @@ +Users of your model will usually first look at +this page before using your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_296.txt b/chunked/nltk_chunking/_add_new_model/chunk_296.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb90bc77075d3f8842bd017770a0475dd7ad9319 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_296.txt @@ -0,0 +1 @@ +Hence, the documentation must be understandable and concise. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_297.txt b/chunked/nltk_chunking/_add_new_model/chunk_297.txt new file mode 100644 index 0000000000000000000000000000000000000000..839648fbf3b8084acccd46f62f675478ce1e0b93 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_297.txt @@ -0,0 +1,2 @@ +It is very useful for +the community to add some Tips to show how the model should be used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_298.txt b/chunked/nltk_chunking/_add_new_model/chunk_298.txt new file mode 100644 index 0000000000000000000000000000000000000000..bda19b5e4e04aaf6649859e72c9cd72929ea8df9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_298.txt @@ -0,0 +1,2 @@ +Don't hesitate to ping the Hugging Face team +regarding the docstrings. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_299.txt b/chunked/nltk_chunking/_add_new_model/chunk_299.txt new file mode 100644 index 0000000000000000000000000000000000000000..090251aba1a7cc5cda60131c0647478ad63651d1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_299.txt @@ -0,0 +1,2 @@ +Next, make sure that the docstring added to src/transformers/models/brand_new_bert/modeling_brand_new_bert.py is +correct and includes all necessary inputs and outputs.
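For reference, a hedged sketch of the docstring style expected on a model's forward method; the argument names below are illustrative only and not taken from any specific model.

```python
# Illustrative only: the shape/type annotations in backticks and the
# "*optional*" markers mirror the convention used across transformers
# modeling files; adapt the argument list to the actual model signature.
def forward(self, input_ids=None, attention_mask=None, labels=None, return_dict=None):
    r"""
    Args:
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.
        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the loss.
        return_dict (`bool`, *optional*):
            Whether or not to return a model output object instead of a plain tuple.

    Returns:
        A model output object (or tuple) containing the loss and the logits.
    """
    ...
```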
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_3.txt b/chunked/nltk_chunking/_add_new_model/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b2b391d54f95908e80ef07b0003c953b345d66c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_3.txt @@ -0,0 +1 @@ +At Hugging Face, we're trying to empower more of the community to actively add models and we've put together this guide to walk you through the process of adding a PyTorch model (make sure you have PyTorch installed). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_30.txt b/chunked/nltk_chunking/_add_new_model/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..18c3130c77af217e8d4ae1c7ba89447e3137e297 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_30.txt @@ -0,0 +1,3 @@ +Calling +[~PreTrainedModel.save_pretrained] will automatically call +[~PretrainedConfig.save_pretrained], so that both model and configuration are saved. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_300.txt b/chunked/nltk_chunking/_add_new_model/chunk_300.txt new file mode 100644 index 0000000000000000000000000000000000000000..24ce4343e65b38febcfb07c7c6464f0df506a3df --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_300.txt @@ -0,0 +1 @@ +We have a detailed guide about writing documentation and our docstring format here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_301.txt b/chunked/nltk_chunking/_add_new_model/chunk_301.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2747691572eeac607aa5cb4600c5f8d071d8b3c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_301.txt @@ -0,0 +1,3 @@ +It is always good to remind oneself that documentation should +be treated at least as carefully as the code in 🤗 Transformers since the documentation is usually the first contact +point of the community with the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_302.txt b/chunked/nltk_chunking/_add_new_model/chunk_302.txt new file mode 100644 index 0000000000000000000000000000000000000000..0335df8d74029af190951950fc5c373aa6e37e13 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_302.txt @@ -0,0 +1,2 @@ +Code refactor +Great, now you have added all the necessary code for brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_303.txt b/chunked/nltk_chunking/_add_new_model/chunk_303.txt new file mode 100644 index 0000000000000000000000000000000000000000..f82861a38fcb62f171edcb71cb2bac0ee8455cf3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_303.txt @@ -0,0 +1,9 @@ +At this point, you should correct some potential +incorrect code style by running: + +make style +and verify that your coding style passes the quality check: + +make quality +There are a couple of other very strict design tests in 🤗 Transformers that might still be failing, which shows up in +the tests of your pull request.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_304.txt b/chunked/nltk_chunking/_add_new_model/chunk_304.txt new file mode 100644 index 0000000000000000000000000000000000000000..53d00a917c0ef44d2527885bb0d5011b1a7a203a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_304.txt @@ -0,0 +1,2 @@ +This is often because of some missing information in the docstring or some incorrect +naming. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_305.txt b/chunked/nltk_chunking/_add_new_model/chunk_305.txt new file mode 100644 index 0000000000000000000000000000000000000000..c47f0f8aa43cfbba6d8676f8585f4bbe7d0ba59b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_305.txt @@ -0,0 +1 @@ +The Hugging Face team will surely help you if you're stuck here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_306.txt b/chunked/nltk_chunking/_add_new_model/chunk_306.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf0efee1ef1148c68368de450bc1ea3f7b92ef5e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_306.txt @@ -0,0 +1 @@ +Lastly, it is always a good idea to refactor one's code after having ensured that the code works correctly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_307.txt b/chunked/nltk_chunking/_add_new_model/chunk_307.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e7cd7962f82eda0b79132cbc7ab2007a0e0c884 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_307.txt @@ -0,0 +1,2 @@ +With all +tests passing, now it's a good time to go over the added code again and do some refactoring. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_308.txt b/chunked/nltk_chunking/_add_new_model/chunk_308.txt new file mode 100644 index 0000000000000000000000000000000000000000..4dd82cdfa8da6028f9a94355141ac8611effc097 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_308.txt @@ -0,0 +1 @@ +You have now finished the coding part, congratulations! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_309.txt b/chunked/nltk_chunking/_add_new_model/chunk_309.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ae67e365ea99cef60ea2f43cc5f12c5eede2af2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_309.txt @@ -0,0 +1 @@ +🎉 You are Awesome! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_31.txt b/chunked/nltk_chunking/_add_new_model/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..54cd98684ac5cb18b977e3e507d997fb3d9dbe6d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_31.txt @@ -0,0 +1,6 @@ +Code style +When coding your new model, keep in mind that Transformers is an opinionated library and we have a few quirks of our +own regarding how code should be written :-) + +The forward pass of your model should be fully written in the modeling file while being fully independent of other + models in the library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_310.txt b/chunked/nltk_chunking/_add_new_model/chunk_310.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e2a219c84c629d5b0483a57052df69483e78740 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_310.txt @@ -0,0 +1,2 @@ +😎 +12.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_311.txt b/chunked/nltk_chunking/_add_new_model/chunk_311.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d0de9e0fd0c1ef31e84ff2079cdd8c4a753b5ca --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_311.txt @@ -0,0 +1,3 @@ +Upload the models to the model hub +In this final part, you should convert and upload all checkpoints to the model hub and add a model card for each +uploaded model checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_312.txt b/chunked/nltk_chunking/_add_new_model/chunk_312.txt new file mode 100644 index 0000000000000000000000000000000000000000..713b52a8adffb53f3ee568525723228a61f0a0e1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_312.txt @@ -0,0 +1 @@ +You can get familiar with the hub functionalities by reading our Model sharing and uploading Page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_313.txt b/chunked/nltk_chunking/_add_new_model/chunk_313.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a05a8962d32114caaf06f5b2c02be27475636fb --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_313.txt @@ -0,0 +1,3 @@ +You should work alongside the Hugging Face team here to decide on a fitting name for each +checkpoint and to get the required access rights to be able to upload the model under the author's organization of +brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_314.txt b/chunked/nltk_chunking/_add_new_model/chunk_314.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8984ec7ba734d22440a1b14d01565fc8b0c40d8 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_314.txt @@ -0,0 +1 @@ +The push_to_hub method, present in all models in transformers, is a quick and efficient way to push your checkpoint to the hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_315.txt b/chunked/nltk_chunking/_add_new_model/chunk_315.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0f3538920f8862878c0b7135a3e88afe0c659fd --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_315.txt @@ -0,0 +1,4 @@ +A little snippet is pasted below: +thon +brand_new_bert.push_to_hub("brand_new_bert") +Uncomment the following line to push to an organization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_316.txt b/chunked/nltk_chunking/_add_new_model/chunk_316.txt new file mode 100644 index 0000000000000000000000000000000000000000..8818fc085a549b469675446fa5e97c03229fee41 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_316.txt @@ -0,0 +1,3 @@ +brand_new_bert.push_to_hub("/brand_new_bert") + +It is worth spending some time to create fitting model cards for each checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_317.txt b/chunked/nltk_chunking/_add_new_model/chunk_317.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0729992d1d617bcbb0de0d1112a0ee49359efad --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_317.txt @@ -0,0 +1,2 @@ +The model cards should highlight the +specific characteristics of this particular checkpoint, e.g. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_318.txt b/chunked/nltk_chunking/_add_new_model/chunk_318.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b3301e48f5ab92927b3f3eba3658a884c6f813e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_318.txt @@ -0,0 +1,2 @@ +On which dataset was the checkpoint +pretrained/fine-tuned on? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_319.txt b/chunked/nltk_chunking/_add_new_model/chunk_319.txt new file mode 100644 index 0000000000000000000000000000000000000000..d609335bc61db1e073725c33485200ddcb646eb0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_319.txt @@ -0,0 +1 @@ +On what down-stream task should the model be used? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_32.txt b/chunked/nltk_chunking/_add_new_model/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..da9fb8d17ffbd6fcfc5f6df9a5f846836c85220b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_32.txt @@ -0,0 +1,3 @@ +If you want to reuse a block from another model, copy the code and paste it with a + # Copied from comment on top (see here + for a good example and there for more documentation on Copied from). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_320.txt b/chunked/nltk_chunking/_add_new_model/chunk_320.txt new file mode 100644 index 0000000000000000000000000000000000000000..002db8d48b027fed584b8ced9486c068ffb893f0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_320.txt @@ -0,0 +1,2 @@ +And also include some code on how to +correctly use the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_321.txt b/chunked/nltk_chunking/_add_new_model/chunk_321.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4b697a804065352582ff23c4c6d95ea1a00c785 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_321.txt @@ -0,0 +1 @@ +13. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_322.txt b/chunked/nltk_chunking/_add_new_model/chunk_322.txt new file mode 100644 index 0000000000000000000000000000000000000000..59226f52d44cc97273c6fb0263be028b9d2e965c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_322.txt @@ -0,0 +1,3 @@ +(Optional) Add notebook +It is very helpful to add a notebook that showcases in-detail how brand_new_bert can be used for inference and/or +fine-tuned on a downstream task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_323.txt b/chunked/nltk_chunking/_add_new_model/chunk_323.txt new file mode 100644 index 0000000000000000000000000000000000000000..a03d904ea000d631e02f9fb7002b0d5e50827e8d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_323.txt @@ -0,0 +1 @@ +This is not mandatory to merge your PR, but very useful for the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_324.txt b/chunked/nltk_chunking/_add_new_model/chunk_324.txt new file mode 100644 index 0000000000000000000000000000000000000000..018c4e0bccf367bef6943eac1440ba0e3742f713 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_324.txt @@ -0,0 +1 @@ +14. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_325.txt b/chunked/nltk_chunking/_add_new_model/chunk_325.txt new file mode 100644 index 0000000000000000000000000000000000000000..85e2c0109c0800dc999b3c7cf1e109a38c7d7480 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_325.txt @@ -0,0 +1,2 @@ +Submit your finished PR +You're done programming now and can move to the last step, which is getting your PR merged into main. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_326.txt b/chunked/nltk_chunking/_add_new_model/chunk_326.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceda2b4504743e0907f6a687388807ee544e314c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_326.txt @@ -0,0 +1,4 @@ +Usually, the +Hugging Face team should have helped you already at this point, but it is worth taking some time to give your finished +PR a nice description and eventually add comments to your code, if you want to point out certain design choices to your +reviewer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_327.txt b/chunked/nltk_chunking/_add_new_model/chunk_327.txt new file mode 100644 index 0000000000000000000000000000000000000000..5369adc51be9fc1893ad645f60485ff996ee8fc7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_327.txt @@ -0,0 +1 @@ +Share your work!! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_328.txt b/chunked/nltk_chunking/_add_new_model/chunk_328.txt new file mode 100644 index 0000000000000000000000000000000000000000..204024c8e395de52eeb28567f7604b836b1fd52c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_328.txt @@ -0,0 +1 @@ +Now, it's time to get some credit from the community for your work! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_329.txt b/chunked/nltk_chunking/_add_new_model/chunk_329.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c6ccb722d86854474075977b3248627df8aa345 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_329.txt @@ -0,0 +1,2 @@ +Having completed a model addition is a major +contribution to Transformers and the whole NLP community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_33.txt b/chunked/nltk_chunking/_add_new_model/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef38a236be99c896ce419aef852b18bcf47a362c --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_33.txt @@ -0,0 +1 @@ +The code should be fully understandable, even by a non-native English speaker. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_330.txt b/chunked/nltk_chunking/_add_new_model/chunk_330.txt new file mode 100644 index 0000000000000000000000000000000000000000..167331c483c14e0872de2c70e95386e1cd39bbe1 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_330.txt @@ -0,0 +1,2 @@ +Your code and the ported pre-trained models will certainly be +used by hundreds and possibly even thousands of developers and researchers. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_331.txt b/chunked/nltk_chunking/_add_new_model/chunk_331.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e92e1f3e471c5af9219872646f5bf16007a51cf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_331.txt @@ -0,0 +1,2 @@ +You should be proud of your work and share +your achievements with the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_332.txt b/chunked/nltk_chunking/_add_new_model/chunk_332.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd48cad6b4bb422158574de2de945cb9c3eb149f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_332.txt @@ -0,0 +1 @@ +You have made another model that is super easy to access for everyone in the community! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_333.txt b/chunked/nltk_chunking/_add_new_model/chunk_333.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ae2b406f4187acd1551693a6a5775cfabc54ff4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_333.txt @@ -0,0 +1 @@ +🤯 \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_34.txt b/chunked/nltk_chunking/_add_new_model/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4599bde70e3dd266270310890020e0962a1c1cb --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_34.txt @@ -0,0 +1,2 @@ +This means you should pick + descriptive variable names and avoid abbreviations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_35.txt b/chunked/nltk_chunking/_add_new_model/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4fd49cbec15a428429c2108bdb436e06088938e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_35.txt @@ -0,0 +1 @@ +As an example, activation is preferred to act. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_36.txt b/chunked/nltk_chunking/_add_new_model/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..62bbfd863a986e12c6063d78aa8014de118d6a9a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_36.txt @@ -0,0 +1 @@ +One-letter variable names are strongly discouraged unless it's an index in a for loop. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_37.txt b/chunked/nltk_chunking/_add_new_model/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..afafde7ae629a90a8b2b9091409b95a69d3ff3c5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_37.txt @@ -0,0 +1 @@ +More generally we prefer longer explicit code to short magical one. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_38.txt b/chunked/nltk_chunking/_add_new_model/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5e9e0abf597eeb1e3742f02a873f3fa554dea40 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_38.txt @@ -0,0 +1,2 @@ +Avoid subclassing nn.Sequential in PyTorch but subclass nn.Module and write the forward pass, so that anyone + using your code can quickly debug it by adding print statements or breaking points. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_39.txt b/chunked/nltk_chunking/_add_new_model/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..16e595b844686901031c9d2b2f3c329d30031724 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_39.txt @@ -0,0 +1 @@ +Your function signature should be type-annotated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_4.txt b/chunked/nltk_chunking/_add_new_model/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..5385fe7e91d84ec1b5377bd83017a185f8e9c4b2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_4.txt @@ -0,0 +1 @@ +If you're interested in implementing a TensorFlow model, take a look at the How to convert a 🤗 Transformers model to TensorFlow guide! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_40.txt b/chunked/nltk_chunking/_add_new_model/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..f06f49b91ce6ee5cf364555a8f7ed302445788a3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_40.txt @@ -0,0 +1,2 @@ +For the rest, good variable names are way more readable and + understandable than type annotations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_41.txt b/chunked/nltk_chunking/_add_new_model/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..55b8052355173577f2457bc0933f7f14ff0b12ef --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_41.txt @@ -0,0 +1,2 @@ +Overview of tokenizers +Not quite ready yet :-( This section will be added soon! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_42.txt b/chunked/nltk_chunking/_add_new_model/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..a70881d2ec0fbde19021e3e79acf37bb6c442fca --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_42.txt @@ -0,0 +1,3 @@ +Step-by-step recipe to add a model to 🤗 Transformers +Everyone has different preferences of how to port a model so it can be very helpful for you to take a look at summaries +of how other contributors ported models to Hugging Face. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_43.txt b/chunked/nltk_chunking/_add_new_model/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..f82a13346c3819dfee96d7b2e14e6c11a80f7d93 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_43.txt @@ -0,0 +1,8 @@ +Here is a list of community blog posts on how to port a model: + +Porting GPT2 Model by Thomas +Porting WMT19 MT Model by Stas + +From experience, we can tell you that the most important things to keep in mind when adding a model are: + +Don't reinvent the wheel! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_44.txt b/chunked/nltk_chunking/_add_new_model/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5fea12b20fba72cfbdb7481c752d645d976ba5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_44.txt @@ -0,0 +1,2 @@ +Most parts of the code you will add for the new 🤗 Transformers model already exist + somewhere in 🤗 Transformers. 
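To make the code-style points above concrete (explicit nn.Module with a written-out forward pass, descriptive variable names, type-annotated signatures), a small illustrative sketch follows; the block is invented for the example and is not part of any real model.

```python
# Illustrative only: shows the preferred style rather than any real
# brand_new_bert layer. Every step of the forward pass is explicit, so a
# breakpoint or print statement can inspect intermediate values.
import torch
from torch import nn


class FeedForwardBlock(nn.Module):
    def __init__(self, hidden_size: int, intermediate_size: int, dropout_probability: float = 0.1):
        super().__init__()
        self.up_projection = nn.Linear(hidden_size, intermediate_size)
        self.activation = nn.GELU()
        self.down_projection = nn.Linear(intermediate_size, hidden_size)
        self.dropout = nn.Dropout(dropout_probability)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.up_projection(hidden_states)
        hidden_states = self.activation(hidden_states)
        hidden_states = self.down_projection(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
```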
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_45.txt b/chunked/nltk_chunking/_add_new_model/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec2b0cac91c4427411d5ffc87c7113b72b319aab --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_45.txt @@ -0,0 +1,2 @@ +Take some time to find similar, already existing models and tokenizers you can copy + from. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_46.txt b/chunked/nltk_chunking/_add_new_model/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6db382fb629b96388dfeae5ec5302cb0c9d9ec9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_46.txt @@ -0,0 +1,2 @@ +grep and rg are your + friends. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_47.txt b/chunked/nltk_chunking/_add_new_model/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aefffa0188f08cd2c5f2a925cfba2dc15ca33fd --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_47.txt @@ -0,0 +1,2 @@ +Note that it might very well happen that your model's tokenizer is based on one model implementation, and + your model's modeling code on another one. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_48.txt b/chunked/nltk_chunking/_add_new_model/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_48.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_49.txt b/chunked/nltk_chunking/_add_new_model/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..468823e3c9bccfa8a5349f0abfe2ffbb2190c02a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_49.txt @@ -0,0 +1,2 @@ +FSMT's modeling code is based on BART, while FSMT's tokenizer code + is based on XLM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_5.txt b/chunked/nltk_chunking/_add_new_model/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a7c3111c94989237d25085d1fec851de32b8f90 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_5.txt @@ -0,0 +1,8 @@ +Along the way, you'll: + +get insights into open-source best practices +understand the design principles behind one of the most popular deep learning libraries +learn how to efficiently test large models +learn how to integrate Python utilities like black, ruff, and make fix-copies to ensure clean and readable code + +A Hugging Face team member will be available to help you along the way so you'll never be alone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_50.txt b/chunked/nltk_chunking/_add_new_model/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e43e992250b2be5787f2983fb1d0628d9ad90e7 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_50.txt @@ -0,0 +1 @@ +It's more of an engineering challenge than a scientific challenge. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_51.txt b/chunked/nltk_chunking/_add_new_model/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec0ed5abb65c7c89c1d46152fc6f28e264fd93d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_51.txt @@ -0,0 +1,2 @@ +You should spend more time creating an + efficient debugging environment rather than trying to understand all theoretical aspects of the model in the paper. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_52.txt b/chunked/nltk_chunking/_add_new_model/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a2bc2fa4967236b438b73d09e7e3c581422eafc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_52.txt @@ -0,0 +1 @@ +Ask for help, when you're stuck! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_53.txt b/chunked/nltk_chunking/_add_new_model/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..889c1b58614c8474e513a95f73eecbf1ead90b1d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_53.txt @@ -0,0 +1,2 @@ +Models are the core component of 🤗 Transformers so we at Hugging Face are more + than happy to help you at every step to add your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_54.txt b/chunked/nltk_chunking/_add_new_model/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..34d3ba37b5a0fe1ba6a144c872a3df98058eb683 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_54.txt @@ -0,0 +1,2 @@ +Don't hesitate to ask if you notice you are not making + progress. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_55.txt b/chunked/nltk_chunking/_add_new_model/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..7798e0fd9bee2c648b18c321b90c24aaedcf0509 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_55.txt @@ -0,0 +1 @@ +In the following, we try to give you a general recipe that we found most useful when porting a model to 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_56.txt b/chunked/nltk_chunking/_add_new_model/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdbf40c0fbcda578418b835b4eaa9280d32307c5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_56.txt @@ -0,0 +1,17 @@ +The following list is a summary of everything that has to be done to add a model and can be used by you as a To-Do +List: +☐ (Optional) Understood the model's theoretical aspects +☐ Prepared 🤗 Transformers dev environment +☐ Set up debugging environment of the original repository +☐ Created script that successfully runs the forward() pass using the original repository and checkpoint +☐ Successfully added the model skeleton to 🤗 Transformers +☐ Successfully converted original checkpoint to 🤗 Transformers checkpoint +☐ Successfully ran forward() pass in 🤗 Transformers that gives identical output to original checkpoint +☐ Finished model tests in 🤗 Transformers +☐ Successfully added tokenizer in 🤗 Transformers +☐ Run end-to-end integration tests +☐ Finished docs +☐ Uploaded model weights to the Hub +☐ Submitted the pull request +☐ (Optional) Added a demo notebook +To begin with, we usually recommend starting by getting a good theoretical understanding of BrandNewBert.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_57.txt b/chunked/nltk_chunking/_add_new_model/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..883d08d4825bc375f7d3776030b650a5d9b7b19f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_57.txt @@ -0,0 +1,3 @@ +However, +if you prefer to understand the theoretical aspects of the model on-the-job, then it is totally fine to directly dive +into the BrandNewBert's code-base. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_58.txt b/chunked/nltk_chunking/_add_new_model/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..d23764fb03d6cb05bfc675a0248576096f3ebefa --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_58.txt @@ -0,0 +1,3 @@ +This option might suit you better if your engineering skills are better than +your theoretical skill, if you have trouble understanding BrandNewBert's paper, or if you just enjoy programming +much more than reading scientific papers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_59.txt b/chunked/nltk_chunking/_add_new_model/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee85c37add9a5c3b9e680b0a73a2d3d39ca24f91 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_59.txt @@ -0,0 +1 @@ +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_6.txt b/chunked/nltk_chunking/_add_new_model/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0896cd20bed669322f0357b53dc64a477caa5e5f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_6.txt @@ -0,0 +1,2 @@ +🤗 ❤️ +To get started, open a New model addition issue for the model you want to see in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_60.txt b/chunked/nltk_chunking/_add_new_model/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..9784fa877c0d75d0f4404304827f785a5d94b9ee --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_60.txt @@ -0,0 +1,2 @@ +(Optional) Theoretical aspects of BrandNewBert +You should take some time to read BrandNewBert's paper, if such descriptive work exists. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_61.txt b/chunked/nltk_chunking/_add_new_model/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd4be4688023f1f2e3249bdb9bea1fd5c6fd6fe4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_61.txt @@ -0,0 +1,2 @@ +There might be large +sections of the paper that are difficult to understand. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_62.txt b/chunked/nltk_chunking/_add_new_model/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb9943803b4b684b35e49675ea115a4a755b52b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_62.txt @@ -0,0 +1 @@ +If this is the case, this is fine - don't worry!
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_63.txt b/chunked/nltk_chunking/_add_new_model/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..61bf8575887884657867413573a6bb70c8cb7cfc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_63.txt @@ -0,0 +1,3 @@ +The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_64.txt b/chunked/nltk_chunking/_add_new_model/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bc543eee84033b5e3619e7cedce791b10335cfa --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_64.txt @@ -0,0 +1,4 @@ +That being said, you don't have to spend too much time on the +theoretical aspects, but rather focus on the practical ones, namely: + +What type of model is brand_new_bert? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_65.txt b/chunked/nltk_chunking/_add_new_model/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7d8e0f096b019219c31ada10824a437491f8721 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_65.txt @@ -0,0 +1 @@ +BERT-like encoder-only model? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_66.txt b/chunked/nltk_chunking/_add_new_model/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..3700676cbff0d3a6e64dac5657bd3d1fbdc64ebf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_66.txt @@ -0,0 +1 @@ +GPT2-like decoder-only model? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_67.txt b/chunked/nltk_chunking/_add_new_model/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd65e504ec802127e1bb90795a0c01c9aa3b06bf --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_67.txt @@ -0,0 +1,2 @@ +BART-like + encoder-decoder model? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_68.txt b/chunked/nltk_chunking/_add_new_model/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..215bc600a353207c85e16022ed9d8556f9bfff3a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_68.txt @@ -0,0 +1 @@ +Look at the model_summary if you're not familiar with the differences between those. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_69.txt b/chunked/nltk_chunking/_add_new_model/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a51c9742906f788c7c323ad9e6774d5db05f570 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_69.txt @@ -0,0 +1 @@ +What are the applications of brand_new_bert? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_7.txt b/chunked/nltk_chunking/_add_new_model/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..624fc8e47c8b5f31b64ef6fbd067ae8aa8ab7843 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_7.txt @@ -0,0 +1 @@ +If you're not especially picky about contributing a specific model, you can filter by the New model label to see if there are any unclaimed model requests and work on it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_70.txt b/chunked/nltk_chunking/_add_new_model/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..70fcc4d98b5d25fd02782612637d59615b393951 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_70.txt @@ -0,0 +1 @@ +Text classification? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_71.txt b/chunked/nltk_chunking/_add_new_model/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbebed75ba1c8d60b011071771771bfcf44e4ef2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_71.txt @@ -0,0 +1 @@ +Text generation? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_72.txt b/chunked/nltk_chunking/_add_new_model/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..5011a29aa3c72eabab5e8ba7526e481f59ad0674 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_72.txt @@ -0,0 +1,2 @@ +Seq2Seq tasks, e.g., + summarization? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_73.txt b/chunked/nltk_chunking/_add_new_model/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..246b349791df9dbb72581ed5e7fa2fcdb5dbc2a2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_73.txt @@ -0,0 +1 @@ +What is the novel feature of the model that makes it different from BERT/GPT-2/BART? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_74.txt b/chunked/nltk_chunking/_add_new_model/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..99c9aff9ed56d469d6bdb03be5130aa563319242 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_74.txt @@ -0,0 +1,2 @@ +Which of the already existing 🤗 Transformers models is most + similar to brand_new_bert? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_75.txt b/chunked/nltk_chunking/_add_new_model/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..647b78b1de0d36babd1394790dc94ccce8afc252 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_75.txt @@ -0,0 +1 @@ +What type of tokenizer is used? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_76.txt b/chunked/nltk_chunking/_add_new_model/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..73b2e7167cc8348f915a259547c79dbe7eaa195e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_76.txt @@ -0,0 +1 @@ +A sentencepiece tokenizer? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_77.txt b/chunked/nltk_chunking/_add_new_model/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..062382bc814f7d991e3a8f7cca48754c31277c8a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_77.txt @@ -0,0 +1 @@ +Word piece tokenizer? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_78.txt b/chunked/nltk_chunking/_add_new_model/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..981fd98cfebd09a5944d97f3886fc36b1111de39 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_78.txt @@ -0,0 +1,2 @@ +Is it the same tokenizer as used + for BERT or BART? 
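One quick, hedged way to answer the tokenizer questions above is to load the tokenizers of a few existing, comparable models and look at their classes; the checkpoints below are public Hub models used purely for illustration.

```python
# Inspect which tokenizer classes comparable models use before deciding what
# brand_new_bert needs (WordPiece, byte-level BPE, SentencePiece, ...).
from transformers import AutoTokenizer

for checkpoint in ["bert-base-uncased", "facebook/bart-base", "xlm-roberta-base"]:
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    print(f"{checkpoint}: {type(tokenizer).__name__}")
# Expected (roughly): BertTokenizerFast (WordPiece), BartTokenizerFast
# (byte-level BPE), XLMRobertaTokenizerFast (SentencePiece).
```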
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_79.txt b/chunked/nltk_chunking/_add_new_model/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc46e3c452d16c722db84c563bc78b1dfa081823 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_79.txt @@ -0,0 +1,2 @@ +After you feel like you have gotten a good overview of the architecture of the model, you might want to write to the +Hugging Face team with any questions you might have. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_8.txt b/chunked/nltk_chunking/_add_new_model/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee3939e76a36ea21a4b638e6ee6077c83841cb34 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_8.txt @@ -0,0 +1 @@ +Once you've opened a new model request, the first step is to get familiar with 🤗 Transformers if you aren't already! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_80.txt b/chunked/nltk_chunking/_add_new_model/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..d71c7c3e833db27770218bc6fb7558e9fdca7389 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_80.txt @@ -0,0 +1,2 @@ +This might include questions regarding the model's architecture, +its attention layer, etc. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_81.txt b/chunked/nltk_chunking/_add_new_model/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..d93fb5b16b676106f6a9d37f97360d3bd07c138e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_81.txt @@ -0,0 +1 @@ +We will be more than happy to help you. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_82.txt b/chunked/nltk_chunking/_add_new_model/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_82.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_83.txt b/chunked/nltk_chunking/_add_new_model/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b51be162484a502a7eb2f64b04c25eb2ef5a02f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_83.txt @@ -0,0 +1,4 @@ +Next prepare your environment + +Fork the repository by clicking on the 'Fork' button on the + repository's page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_84.txt b/chunked/nltk_chunking/_add_new_model/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..18667b7524d4e404b0dfd64d346e2b36dc306359 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_84.txt @@ -0,0 +1 @@ +This creates a copy of the code under your GitHub user account.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_85.txt b/chunked/nltk_chunking/_add_new_model/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad8cb9c836242cc381455c53766535cf0ed1fd9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_85.txt @@ -0,0 +1,11 @@ +Clone your transformers fork to your local disk, and add the base repository as a remote: + +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git + +Set up a development environment, for instance by running the following command: + +python -m venv .env +source .env/bin/activate +pip install -e ". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_86.txt b/chunked/nltk_chunking/_add_new_model/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..7077265d5a98371e491ed533e396745e2c8849e5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_86.txt @@ -0,0 +1,3 @@ +[dev]" +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_87.txt b/chunked/nltk_chunking/_add_new_model/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..cdc49f051186cde05ea6db289c6d163e9e3fc008 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_87.txt @@ -0,0 +1,4 @@ +If that's the case make sure to install the Deep Learning framework you are working with +(PyTorch, TensorFlow and/or Flax) then do: + +pip install -e ". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_88.txt b/chunked/nltk_chunking/_add_new_model/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..684404facd80f6b44f5f3881038740eeac7947ca --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_88.txt @@ -0,0 +1,2 @@ +[quality]" +which should be enough for most use cases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_89.txt b/chunked/nltk_chunking/_add_new_model/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..29c7ed363ae27b828b9dad3534af928d42beb926 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_89.txt @@ -0,0 +1,5 @@ +You can then return to the parent directory + +cd .. + +We recommend adding the PyTorch version of brand_new_bert to Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_9.txt b/chunked/nltk_chunking/_add_new_model/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3331084467b44ddce64bab090cbd639bca170efd --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_9.txt @@ -0,0 +1,2 @@ +General overview of 🤗 Transformers +First, you should get a general overview of 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_90.txt b/chunked/nltk_chunking/_add_new_model/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe790fbdc935c86ea8d379f4dd35dcbb67543391 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_90.txt @@ -0,0 +1,2 @@ +To install PyTorch, please follow the + instructions on https://pytorch.org/get-started/locally/. 
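Once the editable install and PyTorch are in place, a quick sanity check of the environment might look like this; it is only a sketch, and exact versions will differ on your machine.

```python
# Verify the dev environment is usable. CUDA being unavailable is fine,
# since porting work can be done entirely on CPU.
import torch
import transformers

print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
```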
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_91.txt b/chunked/nltk_chunking/_add_new_model/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..60b2aaf191c125a8e34f8d1ebf9ddd51a5f8104f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_91.txt @@ -0,0 +1 @@ +Note: You don't need to have CUDA installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_92.txt b/chunked/nltk_chunking/_add_new_model/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..14c793419bb9a549ab3a1a132edcbaba9ce6958d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_92.txt @@ -0,0 +1 @@ +Making the new model work on CPU is sufficient. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_93.txt b/chunked/nltk_chunking/_add_new_model/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..54abb29f97c12064a119b6c725b135e331eb3c34 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_93.txt @@ -0,0 +1,5 @@ +To port brand_new_bert, you will also need access to its original repository: + +git clone https://github.com/org_that_created_brand_new_bert_org/brand_new_bert.git +cd brand_new_bert +pip install -e . \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_94.txt b/chunked/nltk_chunking/_add_new_model/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..d376955c74d5bc610e085401a10fe1f96ec1c62a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_94.txt @@ -0,0 +1 @@ +Now you have set up a development environment to port brand_new_bert to 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_95.txt b/chunked/nltk_chunking/_add_new_model/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbba075a533f80b8236c9119828bccb50df02b90 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_95.txt @@ -0,0 +1 @@ +3.-4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_96.txt b/chunked/nltk_chunking/_add_new_model/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfb3461c672be6557f01e97507a2b0f9759b3237 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_96.txt @@ -0,0 +1,2 @@ +Run a pretrained checkpoint using the original repository +At first, you will work on the original brand_new_bert repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_97.txt b/chunked/nltk_chunking/_add_new_model/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..386aa8ba46f88a64029af9ed2141dc17a5226b65 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_97.txt @@ -0,0 +1,2 @@ +Often, the original implementation is very +“researchy”. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_98.txt b/chunked/nltk_chunking/_add_new_model/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca432b8ba52f03a1c047fb6814a4e41eee78633f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_98.txt @@ -0,0 +1 @@ +Meaning that documentation might be lacking and the code can be difficult to understand.
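In practice, running the original checkpoint usually boils down to a short script that captures reference outputs for later comparison. The sketch below is entirely hypothetical: module names, loader functions, and paths are placeholders, since every original repository exposes its own API.

```python
# Hypothetical sketch: capture reference outputs from the original repository
# so they can later be compared against the 🤗 Transformers port with
# torch.allclose. All names (original_brand_new_bert, load_model,
# load_tokenizer, checkpoint paths) are placeholders, not a real API.
import torch
from original_brand_new_bert import load_model, load_tokenizer  # hypothetical

original_model = load_model("/path/to/original/checkpoint")
original_tokenizer = load_tokenizer("/path/to/original/checkpoint")

input_ids = original_tokenizer.encode("This is a test input.")
with torch.no_grad():
    reference_output = original_model(torch.tensor([input_ids]))

# Persist the reference so the ported model can be verified against it later.
torch.save(reference_output, "reference_output.pt")
```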
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_model/chunk_99.txt b/chunked/nltk_chunking/_add_new_model/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6884230015955212286dc904f37426c15f56111 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_model/chunk_99.txt @@ -0,0 +1,2 @@ +But this should +be exactly your motivation to reimplement brand_new_bert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_0.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0596f0bb4f8a7238d60a21f4a68bd14ed67640b --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_0.txt @@ -0,0 +1,2 @@ + +How to create a custom pipeline? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_1.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a68b1d234ab38b2efbaf41fb202e641ec192d54 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_1.txt @@ -0,0 +1,2 @@ +In this guide, we will see how to create a custom pipeline and share it on the Hub or add it to the +🤗 Transformers library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_10.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..23f20b0dc15443db787e252979e4fec1c1887177 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_10.txt @@ -0,0 +1,2 @@ +Start by inheriting the base class Pipeline with the 4 methods needed to implement preprocess, +_forward, postprocess, and _sanitize_parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_11.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d58b59cb6927103c9014b15d9f926d438317188 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_11.txt @@ -0,0 +1,25 @@ +thon +from transformers import Pipeline +class MyPipeline(Pipeline): + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + if "maybe_arg" in kwargs: + preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] + return preprocess_kwargs, {}, {} +def preprocess(self, inputs, maybe_arg=2): + model_input = Tensor(inputs["input_ids"]) + return {"model_input": model_input} + +def _forward(self, model_inputs): + # model_inputs == {"model_input": model_input} + outputs = self.model(**model_inputs) + # Maybe {"logits": Tensor()} + return outputs + +def postprocess(self, model_outputs): + best_class = model_outputs["logits"].softmax(-1) + return best_class + +The structure of this breakdown is to support relatively seamless support for CPU/GPU, while supporting doing +pre/postprocessing on the CPU on different threads +preprocess will take the originally defined inputs, and turn them into something feedable to the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_12.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6d3eab994102f45acf7b4e8d801ee933c97fe49 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_12.txt @@ -0,0 +1,2 @@ +It might +contain more information and is usually a Dict. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_13.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2707021a9334704a41ec837dcac6e2b8a1d0dae9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_13.txt @@ -0,0 +1 @@ +_forward is the implementation detail and is not meant to be called directly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_14.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b6ef8234fef9022f62a9f9ca7f2a676404ad32d --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_14.txt @@ -0,0 +1,2 @@ +forward is the preferred +called method as it contains safeguards to make sure everything is working on the expected device. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_15.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..33045cc9ed735b7c9d8f095e78eb0d96182f1f52 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_15.txt @@ -0,0 +1,2 @@ +If anything is +linked to a real model it belongs in the _forward method, anything else is in the preprocess/postprocess. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_16.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..48e6f9c7fcb68744e064e31e86947c5db58cb987 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_16.txt @@ -0,0 +1,2 @@ +postprocess methods will take the output of _forward and turn it into the final output that was decided +earlier. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_17.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b22bd0976e504b590b5c5bb70ba9faeacf89363 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_17.txt @@ -0,0 +1,2 @@ +_sanitize_parameters exists to allow users to pass any parameters whenever they wish, be it at initialization +time pipeline(., maybe_arg=4) or at call time pipe = pipeline(); output = pipe(., maybe_arg=4). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_18.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba758c09d5c8a9e328cc4820e5d729828ec11aba --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_18.txt @@ -0,0 +1,2 @@ +The returns of _sanitize_parameters are the 3 dicts of kwargs that will be passed directly to preprocess, +_forward, and postprocess. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_19.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b016b7f50761e4c0e656e2c6fe5c68aba08fbc2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_19.txt @@ -0,0 +1 @@ +Don't fill anything if the caller didn't call with any extra parameter. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_2.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..71caecbaa440ff4ea479401032e8edcf2b7785d2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_2.txt @@ -0,0 +1 @@ +First and foremost, you need to decide the raw entries the pipeline will be able to take. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_20.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..12d184892f31c722b557c09a365fc7f70fec6468 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_20.txt @@ -0,0 +1,2 @@ +That
+allows you to keep the default arguments in the function definition, which is always more "natural". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_21.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d889d7eb6725eaad5aee7e7bb6c9a4361c562847 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_21.txt @@ -0,0 +1 @@ +A classic example would be a top_k argument in the post processing of classification tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_22.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..261585d58d734984e5287dea41c18b8c22eb8ce2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_22.txt @@ -0,0 +1,11 @@ +python
+
+pipe = pipeline("my-new-task")
+pipe("This is a test")
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05},
+{"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+pipe("This is a test", top_k=2)
+[{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+
+In order to achieve that, we'll update our postprocess method with a default top_k parameter of 5, and edit
+_sanitize_parameters to allow this new parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_23.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..11e01d97a16b3192d03e621b76e0fb618014e807 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_23.txt @@ -0,0 +1,16 @@ +python
+def postprocess(self, model_outputs, top_k=5):
+    best_class = model_outputs["logits"].softmax(-1)
+    # Add logic to handle top_k
+    return best_class
+
+def _sanitize_parameters(self, **kwargs):
+    preprocess_kwargs = {}
+    if "maybe_arg" in kwargs:
+        preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+    postprocess_kwargs = {}
+    if "top_k" in kwargs:
+        postprocess_kwargs["top_k"] = kwargs["top_k"]
+    return preprocess_kwargs, {}, postprocess_kwargs
+
+Try to keep the inputs/outputs very simple and ideally JSON-serializable, as it makes the pipeline usage very easy
+without requiring users to understand new kinds of objects.
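The snippet above leaves a "# Add logic to handle top_k" placeholder. One possible way to fill it in, sketched as a standalone function under the assumption that the model's config provides an id2label mapping (the helper name is made up):
python
import torch

def top_k_postprocess(logits, id2label, top_k=5):
    # Possible body for the "# Add logic to handle top_k" placeholder above
    probs = logits[0].softmax(-1)        # (num_labels,) probabilities
    k = min(top_k, probs.shape[-1])      # never request more classes than exist
    scores, ids = probs.topk(k)
    return [{"label": id2label[i.item()], "score": s.item()} for s, i in zip(scores, ids)]

# Example with random logits for a 5-class model
print(top_k_postprocess(torch.randn(1, 5), {i: f"{i + 1}-star" for i in range(5)}, top_k=2))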
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_24.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..83eabcb6047becdec5f9cbcce9295f8881853a36 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_24.txt @@ -0,0 +1,24 @@ +It's also relatively common to support many different types
+of arguments for ease of use (audio files, for example, can be filenames, URLs or pure bytes).
+Adding it to the list of supported tasks
+To register your new-task to the list of supported tasks, you have to add it to the PIPELINE_REGISTRY:
+python
+from transformers.pipelines import PIPELINE_REGISTRY
+PIPELINE_REGISTRY.register_pipeline(
+    "new-task",
+    pipeline_class=MyPipeline,
+    pt_model=AutoModelForSequenceClassification,
+)
+
+You can specify a default model if you want, in which case it should come with a specific revision (which can be the name of a branch or a commit hash, here we took "abcdef") as well as the type:
+python
+PIPELINE_REGISTRY.register_pipeline(
+    "new-task",
+    pipeline_class=MyPipeline,
+    pt_model=AutoModelForSequenceClassification,
+    default={"pt": ("user/awesome_model", "abcdef")},
+    type="text",  # currently supported types: text, audio, image, multimodal
+)
+Share your pipeline on the Hub
+To share your custom pipeline on the Hub, you just have to save the custom code of your Pipeline subclass in a
+python file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_25.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..1658cdaf72837a9afb89c256120e40a686b91508 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_25.txt @@ -0,0 +1,31 @@ +For instance, let's say we want to use a custom pipeline for sentence pair classification like this:
+
+import numpy as np
+from transformers import Pipeline
+def softmax(outputs):
+    maxes = np.max(outputs, axis=-1, keepdims=True)
+    shifted_exp = np.exp(outputs - maxes)
+    return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+class PairClassificationPipeline(Pipeline):
+    def _sanitize_parameters(self, **kwargs):
+        preprocess_kwargs = {}
+        if "second_text" in kwargs:
+            preprocess_kwargs["second_text"] = kwargs["second_text"]
+        return preprocess_kwargs, {}, {}
+
+    def preprocess(self, text, second_text=None):
+        return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)
+
+    def _forward(self, model_inputs):
+        return self.model(**model_inputs)
+
+    def postprocess(self, model_outputs):
+        logits = model_outputs.logits[0].numpy()
+        probabilities = softmax(logits)
+
+        best_class = np.argmax(probabilities)
+        label = self.model.config.id2label[best_class]
+        score = probabilities[best_class].item()
+        logits = logits.tolist()
+        return {"label": label, "score": score, "logits": logits}
+
+The implementation is framework agnostic, and will work for PyTorch and TensorFlow models.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_26.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac70d3913dda9fbb1781e8b5cc7da459579180bc --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_26.txt @@ -0,0 +1,14 @@ +If we have saved this in +a file named pair_classification.py, we can then import it and register it like this: + +from pair_classification import PairClassificationPipeline +from transformers.pipelines import PIPELINE_REGISTRY +from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification +PIPELINE_REGISTRY.register_pipeline( + "pair-classification", + pipeline_class=PairClassificationPipeline, + pt_model=AutoModelForSequenceClassification, + tf_model=TFAutoModelForSequenceClassification, +) + +Once this is done, we can use it with a pretrained model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_27.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f432f402363e6b33cc7ff30e0ae398be31d5487 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_27.txt @@ -0,0 +1,2 @@ +For instance sgugger/finetuned-bert-mrpc has been +fine-tuned on the MRPC dataset, which classifies pairs of sentences as paraphrases or not. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_28.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5a31cfd5eb59586a503e496b0cebf8915319385 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_28.txt @@ -0,0 +1,13 @@ +from transformers import pipeline +classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc") + +Then we can share it on the Hub by using the save_pretrained method in a Repository: + +from huggingface_hub import Repository +repo = Repository("test-dynamic-pipeline", clone_from="{your_username}/test-dynamic-pipeline") +classifier.save_pretrained("test-dynamic-pipeline") +repo.push_to_hub() + +This will copy the file where you defined PairClassificationPipeline inside the folder "test-dynamic-pipeline", +along with saving the model and tokenizer of the pipeline, before pushing everything into the repository +{your_username}/test-dynamic-pipeline. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_29.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..99ad4e27a325273d86865a70b270af32f9cfe651 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_29.txt @@ -0,0 +1,9 @@ +After that, anyone can use it as long as they provide the option +trust_remote_code=True: + +from transformers import pipeline +classifier = pipeline(model="{your_username}/test-dynamic-pipeline", trust_remote_code=True) + +Add the pipeline to 🤗 Transformers +If you want to contribute your pipeline to 🤗 Transformers, you will need to add a new module in the pipelines submodule +with the code of your pipeline, then add it to the list of tasks defined in pipelines/__init__.py. 
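Putting the registration and sharing steps above together, a hypothetical end-to-end usage of the pair-classification pipeline could look like the following sketch (it assumes PairClassificationPipeline has already been registered as shown; the exact labels depend on the checkpoint's id2label mapping):
python
from transformers import pipeline

classifier = pipeline("pair-classification", model="sgugger/finetuned-bert-mrpc")

# "second_text" is routed to preprocess() by _sanitize_parameters
result = classifier("I like you.", second_text="I love you.")
print(result)  # e.g. {"label": ..., "score": ..., "logits": [...]}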
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_3.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff3a28e97fc545d9578781c4b1b7f3ecc2d108a6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_3.txt @@ -0,0 +1,2 @@ +It can be strings, raw bytes, +dictionaries or whatever seems to be the most likely desired input. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_30.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5431e984d61470d30ff253f0d0afa27614344bbd --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_30.txt @@ -0,0 +1 @@ +Then you will need to add tests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_31.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..5f79de4953595133cf3bd26fc69010952006afc6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_31.txt @@ -0,0 +1 @@ +Create a new file tests/test_pipelines_MY_PIPELINE.py with examples of the other tests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_32.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..af4f3b9e0a46eb942329d6e61e4653db62cc5e97 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_32.txt @@ -0,0 +1,2 @@ +The run_pipeline_test function will be very generic and run on small random models on every possible +architecture as defined by model_mapping and tf_model_mapping. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_33.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd66496f754123fda6ccb6ae9371e601bf241fc4 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_33.txt @@ -0,0 +1,2 @@ +This is very important to test future compatibility, meaning if someone adds a new model for +XXXForQuestionAnswering then the pipeline test will attempt to run on it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_34.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..12db79ba21306a54e7ee497171a6128bc8186be2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_34.txt @@ -0,0 +1,3 @@ +Because the models are random it's +impossible to check for actual values, that's why there is a helper ANY that will simply attempt to match the +output of the pipeline TYPE. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_35.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..03dd3325640275838b7e123cdc05d7e31f690e26 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_35.txt @@ -0,0 +1 @@ +You also need to implement 2 (ideally 4) tests. 
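As a rough illustration of the ANY helper mentioned just above (this is a conceptual sketch, not the actual test helper shipped with 🤗 Transformers), such an object compares equal to anything of the given type, so tests can assert the structure of the output without depending on the random model's values:
python
# Conceptual sketch of an "ANY"-style helper used in pipeline tests
class ANY:
    def __init__(self, *expected_types):
        self.expected_types = expected_types

    def __eq__(self, other):
        return isinstance(other, self.expected_types)

    def __repr__(self):
        return f"ANY({', '.join(t.__name__ for t in self.expected_types)})"

# The random model's scores are unpredictable, but the output structure can still be checked:
assert [{"label": "1-star", "score": 0.83}] == [{"label": ANY(str), "score": ANY(float)}]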
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_36.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..149d78103129a2bc8782e8ef440a073197a4a8fa --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_36.txt @@ -0,0 +1,2 @@ +test_small_model_pt : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_37.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..74dc6458399c0ac932e0894bd97ccc4691f80b7f --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_37.txt @@ -0,0 +1 @@ +The results should be the same as test_small_model_tf. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_38.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa627370b34f1cfdebed2bb0925bf915d554a74a --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_38.txt @@ -0,0 +1,2 @@ +test_small_model_tf : Define 1 small model for this pipeline (doesn't matter if the results don't make sense) + and test the pipeline outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_39.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb3b2581e0ec604f2ec53114d3bd9028862d56c2 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_39.txt @@ -0,0 +1 @@ +The results should be the same as test_small_model_pt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_4.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..d38c34440744ba424fe077b145b0eb32315610da --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_4.txt @@ -0,0 +1,2 @@ +Try to keep these inputs as pure Python as possible +as it makes compatibility easier (even through other languages via JSON). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_40.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecde8ea2a7b3e642a501b74cb589a96d57ca00a9 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_40.txt @@ -0,0 +1,2 @@ +test_large_model_pt (optional): Tests the pipeline on a real pipeline where the results are supposed to + make sense. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_41.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..95bc7e94d74f2985c197f0f3e20a41aeea292156 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_41.txt @@ -0,0 +1 @@ +These tests are slow and should be marked as such. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_42.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e79661ac7aa6ccabdfe21b4a4ca6380591e551c3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_42.txt @@ -0,0 +1,2 @@ +Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_43.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc86cbca1ed6adeefaf3a5119ba7f245437fd2d5 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_43.txt @@ -0,0 +1,2 @@ +test_large_model_tf (optional): Tests the pipeline on a real pipeline where the results are supposed to + make sense. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_44.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..95bc7e94d74f2985c197f0f3e20a41aeea292156 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_44.txt @@ -0,0 +1 @@ +These tests are slow and should be marked as such. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_45.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..e79661ac7aa6ccabdfe21b4a4ca6380591e551c3 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_45.txt @@ -0,0 +1,2 @@ +Here the goal is to showcase the pipeline and to make + sure there is no drift in future releases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_5.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5bd0bc33329abbda039af13f3b51136c7d8003 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_5.txt @@ -0,0 +1,2 @@ +Those will be the inputs of the +pipeline (preprocess). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_6.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d7a5c44f02ab373ceea70060405286ee4de4894 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_6.txt @@ -0,0 +1 @@ +Then define the outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_7.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3db7f288646b646b202c6ba536390721b1e5584e --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_7.txt @@ -0,0 +1 @@ +Same policy as the inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_8.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd96b0047a7de77941548088775a1ec40c913be0 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_8.txt @@ -0,0 +1 @@ +The simpler, the better. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_new_pipeline/chunk_9.txt b/chunked/nltk_chunking/_add_new_pipeline/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d95b95bcb49e77bd9c408a5c3208a52822774f6 --- /dev/null +++ b/chunked/nltk_chunking/_add_new_pipeline/chunk_9.txt @@ -0,0 +1,2 @@ +Those will be the outputs of +postprocess method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_0.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e25140516d7c85704b5faf470c2e1f01a88740 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_0.txt @@ -0,0 +1,2 @@ + +How to convert a 🤗 Transformers model to TensorFlow? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_1.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..47a1118d20807477159deaecccfec11fc3f23660 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_1.txt @@ -0,0 +1,2 @@ +Having multiple frameworks available to use with 🤗 Transformers gives you flexibility to play their strengths when +designing your application, but it implies that compatibility must be added on a per-model basis. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_10.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..11c7f55e505df919cc9c13c6b523714f6250e00a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_10.txt @@ -0,0 +1,2 @@ +Check the model_type field of the config.json of your model of choice +(example). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_100.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..949f8cce309b723d5948b1829df18ed0e07820f9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_100.txt @@ -0,0 +1,2 @@ +TFBrandNewBertMainLayer is decorated with @keras_serializable +5. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_101.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..6239e7b829197d8901d2bfe6d68c6a34e21e6a91 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_101.txt @@ -0,0 +1,2 @@ +A TensorFlow model can be loaded from PyTorch weights using TFBrandNewBert.from_pretrained(model_repo, from_pt=True) +6. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_102.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbb09f31aa0b166332634b1c58e282cedd37a071 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_102.txt @@ -0,0 +1,2 @@ +You can call the TensorFlow model using the expected input format +5. 
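Returning to the four pipeline tests described above, here is a hedged sketch of what a test_small_model_pt could look like; the task name assumes the pair-classification pipeline from the previous sections has been registered, and the tiny checkpoint name is only a placeholder:
python
import unittest
from transformers import pipeline

class PairClassificationPipelineTests(unittest.TestCase):
    def test_small_model_pt(self):
        # Placeholder tiny checkpoint: any small random model of a supported
        # architecture would do, since only the output structure is checked.
        classifier = pipeline("pair-classification", model="hf-internal-testing/tiny-random-bert")
        outputs = classifier("I like you.", second_text="I love you.")
        self.assertIsInstance(outputs, dict)
        self.assertEqual(set(outputs), {"label", "score", "logits"})

test_small_model_tf would mirror this with framework="tf", and the large-model variants would additionally assert on real values and be marked as slow.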
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_103.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4e564c9fe71ebe3f3b40f9319130139a7482754 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_103.txt @@ -0,0 +1,2 @@ +Add model tests +Hurray, you've implemented a TensorFlow model! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_104.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..6189dd6effe391e7c25f0b6a5f9d8232b5e2519b --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_104.txt @@ -0,0 +1,2 @@ +Now it's time to add tests to make sure that your model behaves as +expected. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_105.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..a78b7f2efca28b1b7ae4b43f5e84b020f00bf00e --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_105.txt @@ -0,0 +1,3 @@ +As in the previous section, we suggest you start by copying the test_modeling_brand_new_bert.py file in +tests/models/brand_new_bert/ into test_modeling_tf_brand_new_bert.py, and continue by making the necessary +TensorFlow replacements. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_106.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..d84690c1fe9b989bdad93c417df7c89e1abc114c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_106.txt @@ -0,0 +1,2 @@ +For now, in all .from_pretrained() calls, you should use the from_pt=True flag to load +the existing PyTorch weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_107.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..889cb0a06a4a4579d670bdb2e253dccd959e9eca --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_107.txt @@ -0,0 +1 @@ +After you're done, it's time for the moment of truth: run the tests! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_108.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e3aac92f444cf3ae83f7ccd14caaf095171eb37 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_108.txt @@ -0,0 +1,5 @@ +😬 + +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +The most likely outcome is that you'll see a bunch of errors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_109.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..a402ea336388d0ee320ffcd5d8cb0dac607f371b --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_109.txt @@ -0,0 +1 @@ +Don't worry, this is expected! 
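For reference, the from_pt=True pattern mentioned above looks like this in isolation; the model name is a placeholder, and a real port would use the new TFBrandNewBert class with its own PyTorch checkpoint:
python
from transformers import TFAutoModelForSequenceClassification

# Load a TensorFlow architecture directly from PyTorch weights: the checkpoint's
# pytorch_model.bin is converted on the fly instead of reading TF weights.
tf_model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", from_pt=True
)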
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_11.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f135ba464262f8dcb1881a6bbe8b8e5a4e3979d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_11.txt @@ -0,0 +1,3 @@ +If the corresponding model folder in +🤗 Transformers has a file whose name starts with "modeling_tf", it means that it has a corresponding TensorFlow +architecture (example). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_110.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9203a7be417b3d8fea9ea04664f8aa64d17e12 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_110.txt @@ -0,0 +1,2 @@ +Debugging ML models is +notoriously hard, and the key ingredient to success is patience (and breakpoint()). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_111.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6eeb134627c3756f68a30189c2ba122b3990a36 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_111.txt @@ -0,0 +1,2 @@ +In our experience, the hardest +problems arise from subtle mismatches between ML frameworks, for which we have a few pointers at the end of this guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_112.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..6638b1e2ebf9b0f087b99b531f8932b4acb810f6 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_112.txt @@ -0,0 +1,2 @@ +In other cases, a general test might not be directly applicable to your model, in which case we suggest an override +at the model test class level. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_113.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..36f0d2526f9b8c63cebe1915de9d8164566a922b --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_113.txt @@ -0,0 +1,2 @@ +Regardless of the issue, don't hesitate to ask for help in your draft pull request if +you're stuck. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_114.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..638960ee5bfc24906ba28dd7f53a8b738e217058 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_114.txt @@ -0,0 +1 @@ +When all tests pass, congratulations, your model is nearly ready to be added to the 🤗 Transformers library! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_115.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17a00b1ad605a3b841c0963e90f6d0b6985c2bf --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_115.txt @@ -0,0 +1,2 @@ +🎉 +6.-7. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_116.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..cae3a8d4f01746b7de126867d21602fe21b9c2d9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_116.txt @@ -0,0 +1,2 @@ +Ensure everyone can use your model +6. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_117.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf145b8eef0150e6d44018d5923472f86a8e9f50 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_117.txt @@ -0,0 +1,2 @@ +Submit the pull request +Once you're done with the implementation and the tests, it's time to submit a pull request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_118.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..751a69c310d820717255d7be862ba9ec4fb61d02 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_118.txt @@ -0,0 +1,2 @@ +Before pushing your code, +run our code formatting utility, make fixup 🪄. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_119.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..716ec7c5c2753b97822ae07f58d0af98b192e2e0 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_119.txt @@ -0,0 +1,2 @@ +This will automatically fix any formatting issues, which would cause +our automatic checks to fail. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_12.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c43d2c6fddb32c7b43cbe54b1bdfa42e4d5f6f21 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_12.txt @@ -0,0 +1,2 @@ +Step-by-step guide to add TensorFlow model architecture code +There are many ways to design a large model architecture, and multiple ways of implementing said design. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_120.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..18536e2d2f8739922dd5632a6570395dab7a4ccd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_120.txt @@ -0,0 +1 @@ +It's now time to convert your draft pull request into a real pull request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_121.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec36a9ab4e83d35e3df02da98de85d121c23e47c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_121.txt @@ -0,0 +1,2 @@ +To do so, click on the "Ready for +review" button and add Joao (@gante) and Matt (@Rocketknight1) as reviewers. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_122.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5aed48b46731997b9ff4039469e058da7c0202a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_122.txt @@ -0,0 +1,2 @@ +A model pull request will need +at least 3 reviewers, but they will take care of finding appropriate additional reviewers for your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_123.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..85884b5507361dccf3689604aee7daf6cc844dfc --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_123.txt @@ -0,0 +1,2 @@ +After all reviewers are happy with the state of your PR, the final action point is to remove the from_pt=True flag in +.from_pretrained() calls. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_124.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..78fb02ebfd6af6d8642fe17d3cbbdcf77807ee27 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_124.txt @@ -0,0 +1 @@ +Since there are no TensorFlow weights, you will have to add them! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_125.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5959261da66860eecef21e94f3c003ae1a9e0b5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_125.txt @@ -0,0 +1,2 @@ +Check the section +below for instructions on how to do it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_126.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..1fb3572c622c25a4f4bfbcb9f11ddad3c052acb3 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_126.txt @@ -0,0 +1,6 @@ +Finally, when the TensorFlow weights get merged, you have at least 3 reviewer approvals, and all CI checks are +green, double-check the tests locally one last time + +NVIDIA_TF32_OVERRIDE=0 RUN_SLOW=1 RUN_PT_TF_CROSS_TESTS=1 \ +py.test -vv tests/models/brand_new_bert/test_modeling_tf_brand_new_bert.py +and we will merge your PR! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_127.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f569a995a77585a4de800e2457d1221fc0151bb --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_127.txt @@ -0,0 +1,2 @@ +Congratulations on the milestone 🎉 +7. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_128.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..adf410a2780439e0b86fdbde91e48d362d79bd47 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_128.txt @@ -0,0 +1,2 @@ +(Optional) Build demos and share with the world +One of the hardest parts about open-source is discovery. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_129.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4057a889255a97b6492fa2786d312a5596edc32 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_129.txt @@ -0,0 +1,2 @@ +How can the other users learn about the existence of your +fabulous TensorFlow contribution? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_13.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..69dbe0303794e6d53fa7cdcfe9028603a82f865c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_13.txt @@ -0,0 +1,3 @@ +However, +you might recall from our general overview of 🤗 Transformers +that we are an opinionated bunch - the ease of use of 🤗 Transformers relies on consistent design choices. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_130.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b7e253639079eac07753fc97a9b65001f2e1215 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_130.txt @@ -0,0 +1 @@ +With proper communication, of course! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_131.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a7843cbb412993439bccb19e21d71297eb3c10f --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_131.txt @@ -0,0 +1,3 @@ +📣 +There are two main ways to share your model with the community: +- Build demos. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_132.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..466dd2bb10482cea7b85bfcdd7b1a45cdcbe3b0e --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_132.txt @@ -0,0 +1 @@ +These include Gradio demos, notebooks, and other fun ways to show off your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_133.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..8db5f9f027e780be6900ee410b901bde5cacb9b8 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_133.txt @@ -0,0 +1,2 @@ +We highly + encourage you to add a notebook to our community-driven demos. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_134.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9e716f1a24a21f96ce595ca5201e660835b17c9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_134.txt @@ -0,0 +1 @@ +- Share stories on social media like Twitter and LinkedIn. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_135.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c69644ba29ad2031238833ff04e9b62f181cf49 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_135.txt @@ -0,0 +1,3 @@ +You should be proud of your work and share + your achievement with the community - your model can now be used by thousands of engineers and researchers around + the world ðŸŒ! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_136.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6f912b0481f352044e991da747d44b13332ee9c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_136.txt @@ -0,0 +1 @@ +We will be happy to retweet your posts and help you share your work with the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_137.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b4b8adf309083d98eb0d2b558105ffd577b5edd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_137.txt @@ -0,0 +1,3 @@ +Adding TensorFlow weights to 🤗 Hub +Assuming that the TensorFlow model architecture is available in 🤗 Transformers, converting PyTorch weights into +TensorFlow weights is a breeze! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_138.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7e02e1b2b5f46120f774acf9fc15493d6b95f4d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_138.txt @@ -0,0 +1,2 @@ +Here's how to do it: +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_139.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..35fe0b0919af36a70c44be91a315f2559b6e6534 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_139.txt @@ -0,0 +1 @@ +Make sure you are logged into your Hugging Face account in your terminal. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_14.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..2993fc905dd1b2777fadf8ee65a82901cf7669f5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_14.txt @@ -0,0 +1,4 @@ +From +experience, we can tell you a few important things about adding TensorFlow models: + +Don't reinvent the wheel! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_140.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..2740121e15d8c6977403461a3a93bb65a7f51e88 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_140.txt @@ -0,0 +1,3 @@ +You can log in using the command + huggingface-cli login (you can find your access tokens here) +2. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_141.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..a34098fb57044b4fc1d413642095f05db262b0dc --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_141.txt @@ -0,0 +1,3 @@ +Run transformers-cli pt-to-tf --model-name foo/bar, where foo/bar is the name of the model repository + containing the PyTorch weights you want to convert +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_142.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..2daf6e21fd685783df81ef60ade651f4f24df032 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_142.txt @@ -0,0 +1,2 @@ +Tag @joaogante and @Rocketknight1 in the 🤗 Hub PR the command above has just created +That's it! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_143.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9c688524a00285878a654a53e574c2949aaf398 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_143.txt @@ -0,0 +1,4 @@ +🎉 +Debugging mismatches across ML frameworks 🛠+At some point, when adding a new architecture or when creating TensorFlow weights for an existing architecture, you +might come across errors complaining about mismatches between PyTorch and TensorFlow. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_144.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4189257c01c387cb961896a348c82041d871906 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_144.txt @@ -0,0 +1,2 @@ +You might even decide to open the +model architecture code for the two frameworks, and find that they look identical. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_145.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..51d48ab88cb5ca75456f18828a335206538c77e0 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_145.txt @@ -0,0 +1 @@ +What's going on? \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_146.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e56fb156c03c2d3ffc53d1d7811650b8bdbf370 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_146.txt @@ -0,0 +1,2 @@ +🤔 +First of all, let's talk about why understanding these mismatches matters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_147.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..f855fcfd2555e5d3151243c07b3d2a1483fd8146 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_147.txt @@ -0,0 +1,2 @@ +Many community members will use 🤗 +Transformers models out of the box, and trust that our models behave as expected. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_148.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..1093d338b94c26279a8aa4f44dd216e650363ae6 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_148.txt @@ -0,0 +1,3 @@ +When there is a large mismatch +between the two frameworks, it implies that the model is not following the reference implementation for at least one +of the frameworks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_149.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..adfe557ff1645f60e7605cc1fc3263c3e81a1af9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_149.txt @@ -0,0 +1 @@ +This might lead to silent failures, in which the model runs but has poor performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_15.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d994c87da8dd16b9fe8338a911000044cb32a87 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_15.txt @@ -0,0 +1,2 @@ +More often than not, there are at least two reference implementations you should check: the +PyTorch equivalent of the model you are implementing and other TensorFlow models for the same class of problems. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_150.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..fae4c504acdbb1dfdad9389855f33d6d2d0cb2b9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_150.txt @@ -0,0 +1,2 @@ +This is +arguably worse than a model that fails to run at all! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_151.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fb804117e992ed1ba34626b85ac3c2a5dedd6bd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_151.txt @@ -0,0 +1,2 @@ +To that end, we aim at having a framework mismatch smaller than +1e-5 at all stages of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_152.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..33459b9b881ab98c14c06bc48fb9a41bac275850 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_152.txt @@ -0,0 +1 @@ +As in other numerical problems, the devil is in the details. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_153.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf3b2bfe043ceeaacf3eddc150a5d615dea00159 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_153.txt @@ -0,0 +1,2 @@ +And as in any detail-oriented craft, the secret +ingredient here is patience. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_154.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf15448b11c8c0ad440d37187c7255f8d485c63c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_154.txt @@ -0,0 +1,2 @@ +Here is our suggested workflow for when you come across this type of issues: +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_155.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbd13d06e50c783df9b21679f57b95a684677e2b --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_155.txt @@ -0,0 +1 @@ +Locate the source of mismatches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_156.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..bef5f891b90fcd5e4702cd511930156ac6d7062a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_156.txt @@ -0,0 +1,2 @@ +The model you're converting probably has near identical inner variables up to a + certain point. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_157.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..11685f04972314f12213b255048c83e0587b5809 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_157.txt @@ -0,0 +1,2 @@ +Place breakpoint() statements in the two frameworks' architectures, and compare the values of the + numerical variables in a top-down fashion until you find the source of the problems. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_158.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_158.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_159.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..27d44c9b92977cf67e82cceb675e52d986abe855 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_159.txt @@ -0,0 +1 @@ +Now that you've pinpointed the source of the issue, get in touch with the 🤗 Transformers team. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_16.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ca1df8ea44cbdf384433a6c0697f80fb06bbcf1 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_16.txt @@ -0,0 +1 @@ +Great model implementations survive the test of time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_160.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b6d1131978c50ebd8ad015d711b88c603143f50 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_160.txt @@ -0,0 +1,2 @@ +It is possible + that we've seen a similar problem before and can promptly provide a solution. 
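A practical way to run the top-down comparison described above is to feed the same input to both frameworks and print the largest absolute difference per hidden state; this is a sketch with a placeholder model name, aiming for the sub-1e-5 mismatch target mentioned earlier:
python
import numpy as np
from transformers import AutoTokenizer, AutoModel, TFAutoModel

name = "bert-base-uncased"  # placeholder; use your brand_new_bert checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
pt_model = AutoModel.from_pretrained(name)
tf_model = TFAutoModel.from_pretrained(name, from_pt=True)

pt_out = pt_model(**tokenizer("Hello world", return_tensors="pt"), output_hidden_states=True)
tf_out = tf_model(**tokenizer("Hello world", return_tensors="tf"), output_hidden_states=True)

for i, (pt_h, tf_h) in enumerate(zip(pt_out.hidden_states, tf_out.hidden_states)):
    max_diff = np.abs(pt_h.detach().numpy() - tf_h.numpy()).max()
    print(f"hidden state {i}: max abs difference = {max_diff:.2e}")  # target: < 1e-5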
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_161.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b66534063c90e5629de81fee53c0d69252208ae --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_161.txt @@ -0,0 +1,2 @@ +As a fallback, scan popular pages + like StackOverflow and GitHub issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_162.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_162.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_163.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6893531793829b173e62ed2cc1195f4c2e74e4e6 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_163.txt @@ -0,0 +1 @@ +If there is no solution in sight, it means you'll have to go deeper. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_164.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3a0c59d84c711d953ad9a559dd024c134767cfd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_164.txt @@ -0,0 +1,2 @@ +The good news is that you've located the + issue, so you can focus on the problematic instruction, abstracting away the rest of the model! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_165.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..45722ca60a61bbeb8c8827ad9adc0703ff224e02 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_165.txt @@ -0,0 +1,2 @@ +The bad news is + that you'll have to venture into the source implementation of said instruction. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_166.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..721e86a3c84142d872f32ffbe3806962053001a4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_166.txt @@ -0,0 +1,2 @@ +In some cases, you might find an + issue with a reference implementation - don't abstain from opening an issue in the upstream repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_167.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..410f5330af120bda18889a8a7e56a1d1722a6691 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_167.txt @@ -0,0 +1 @@ +In some cases, in discussion with the 🤗 Transformers team, we might find that fixing the mismatch is infeasible. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_168.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..b418062b60e09371f87b90c7abbf5c3b2146f670 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_168.txt @@ -0,0 +1,2 @@ +When the mismatch is very small in the output layers of the model (but potentially large in the hidden states), we +might decide to ignore it in favor of distributing the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_169.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..4325d2703008db2c37bc70a51f9cbe3f368e6afe --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_169.txt @@ -0,0 +1,2 @@ +The pt-to-tf CLI mentioned above has a --max-error +flag to override the error message at weight conversion time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_17.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..faf2ea3d24c2548526d5415cbca574684628def5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_17.txt @@ -0,0 +1,2 @@ +This doesn't happen because the code is pretty, but rather +because the code is clear, easy to debug and build upon. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_18.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e8c236c8711622098b8a2d1d81ff156bad6f937 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_18.txt @@ -0,0 +1,3 @@ +If you make the life of the maintainers easy with your +TensorFlow implementation, by replicating the same patterns as in other TensorFlow models and minimizing the mismatch +to the PyTorch implementation, you ensure your contribution will be long lived. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_19.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd07f1f1515fd85ad833b8c7df60888ba300c90e --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_19.txt @@ -0,0 +1 @@ +Ask for help when you're stuck! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_2.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..96f7ff3a0ed63eec0f37b0336db35e16f95616bb --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_2.txt @@ -0,0 +1,2 @@ +The good news is that +adding TensorFlow compatibility to an existing model is simpler than adding a new model from scratch! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_20.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..38430aacb48300250445ecd0c612836dce3eb1a8 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_20.txt @@ -0,0 +1,2 @@ +The 🤗 Transformers team is here to help, and we've probably found solutions to the same +problems you're facing. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_21.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d094c065c40deaa2429669cb47ad7c5437c879ec --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_21.txt @@ -0,0 +1,2 @@ +Here's an overview of the steps needed to add a TensorFlow model architecture: +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_22.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fd9c8c820b3fae40dd7f83b03a474c3224389f4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_22.txt @@ -0,0 +1,2 @@ +Select the model you wish to convert +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_23.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..b21fd933ff47a364f2ba094387ff2402b1ad6c72 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_23.txt @@ -0,0 +1,2 @@ +Prepare transformers dev environment +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_24.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc49379d9ad74a6bc3f3cf3012053108749e3cf4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_24.txt @@ -0,0 +1,2 @@ +(Optional) Understand theoretical aspects and the existing implementation +4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_25.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8c011c89187cd44da9f813dc1803f0c87e980c7 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_25.txt @@ -0,0 +1,2 @@ +Implement the model architecture +5. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_26.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..05f13e73eebe638f394d25302ec4517aea149473 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_26.txt @@ -0,0 +1,2 @@ +Implement model tests +6. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_27.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..293a99109ff0e735e58b005ca171b6f31ed7f6fc --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_27.txt @@ -0,0 +1,2 @@ +Submit the pull request +7. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_28.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d85395e951fe5e2d6c5b2e41931492b52b7d46f4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_28.txt @@ -0,0 +1,2 @@ +(Optional) Build demos and share with the world +1.-3. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_29.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..6accc5564da7cdecd2ad1257489cdb7d64d80188 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_29.txt @@ -0,0 +1,2 @@ +Prepare your model contribution +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_3.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..20eb5611937c5b0611167e1488b5de5ceeead4f0 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_3.txt @@ -0,0 +1,2 @@ +Whether you wish to have a deeper understanding of large TensorFlow models, make a major open-source contribution, or +enable TensorFlow for your model of choice, this guide is for you. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_30.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb7e6371285e730dfdbf9f9f108b1e99ac7d1d4d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_30.txt @@ -0,0 +1,2 @@ +Select the model you wish to convert +Let's start off with the basics: the first thing you need to know is the architecture you want to convert. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_31.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..403af0e746f8fc145b10ff58823a71758589a276 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_31.txt @@ -0,0 +1,4 @@ +If you +don't have your eyes set on a specific architecture, asking the 🤗 Transformers team for suggestions is a great way to +maximize your impact - we will guide you towards the most prominent architectures that are missing on the TensorFlow +side. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_32.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..69a6ae17b97c2c7a0b23af5d8b13e0d0a98d58cd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_32.txt @@ -0,0 +1,4 @@ +If the specific model you want to use with TensorFlow already has a TensorFlow architecture implementation in +🤗 Transformers but is lacking weights, feel free to jump straight into the +weight conversion section +of this page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_33.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..6868207cd2908befd95a03e7c5dd432a6c72ee72 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_33.txt @@ -0,0 +1,2 @@ +For simplicity, the remainder of this guide assumes you've decided to contribute with the TensorFlow version of +BrandNewBert (the same example as in the guide to add a new model from scratch). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_34.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..19faf9844c963a5e3553995d24bd551226c730b8 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_34.txt @@ -0,0 +1 @@ +Before starting the work on a TensorFlow model architecture, double-check that there is no ongoing effort to do so. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_35.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..61d28710a39a37b11e108d8b717730bfce470270 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_35.txt @@ -0,0 +1,3 @@ +You can search for BrandNewBert on the +pull request GitHub page to confirm that there is no +TensorFlow-related pull request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_36.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_36.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_37.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..42ea167512c47f0cf27d9fa28ff9bec491cc53f8 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_37.txt @@ -0,0 +1,2 @@ +Prepare transformers dev environment +Having selected the model architecture, open a draft PR to signal your intention to work on it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_38.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c644587795e5f3d9f32e0c4c8c847a57a198441 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_38.txt @@ -0,0 +1,2 @@ +Follow the +instructions below to set up your environment and open a draft PR. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_39.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fed09693d1b6ae63f18afd422ff8ceae5053a73 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_39.txt @@ -0,0 +1,2 @@ +Fork the repository by clicking on the 'Fork' button on the + repository's page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_4.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..e96f30dec0529d17764c237bd5bc594229edcd24 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_4.txt @@ -0,0 +1,2 @@ +This guide empowers you, a member of our community, to contribute TensorFlow model weights and/or +architectures to be used in 🤗 Transformers, with minimal supervision from the Hugging Face team. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_40.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..18667b7524d4e404b0dfd64d346e2b36dc306359 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_40.txt @@ -0,0 +1 @@ +This creates a copy of the code under your GitHub user account. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_41.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ad8cb9c836242cc381455c53766535cf0ed1fd9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_41.txt @@ -0,0 +1,11 @@ +Clone your transformers fork to your local disk, and add the base repository as a remote: + +git clone https://github.com/[your Github handle]/transformers.git +cd transformers +git remote add upstream https://github.com/huggingface/transformers.git + +Set up a development environment, for instance by running the following command: + +python -m venv .env +source .env/bin/activate +pip install -e ". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_42.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..7077265d5a98371e491ed533e396745e2c8849e5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_42.txt @@ -0,0 +1,3 @@ +[dev]" +Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a +failure with this command. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_43.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fb2bf470a0301792f578f97ff183af20dde43ba --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_43.txt @@ -0,0 +1,3 @@ +If that's the case make sure to install TensorFlow then do: + +pip install -e ". \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_44.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f55bc3728bfd82a55ba8e875dbf09d43e4ac4cb --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_44.txt @@ -0,0 +1,2 @@ +[quality]" +Note: You don't need to have CUDA installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_45.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..14c793419bb9a549ab3a1a132edcbaba9ce6958d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_45.txt @@ -0,0 +1 @@ +Making the new model work on CPU is sufficient. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_46.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..50f12924deeddf2e7585d8fe03946e4784cd58bd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_46.txt @@ -0,0 +1,10 @@ +Create a branch with a descriptive name from your main branch + +git checkout -b add_tf_brand_new_bert + +Fetch and rebase to current main + +git fetch upstream +git rebase upstream/main + +Add an empty .py file in transformers/src/models/brandnewbert/ named modeling_tf_brandnewbert.py. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_47.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc6fa396deda74b600d25626c1b2010f12e1c5b5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_47.txt @@ -0,0 +1,2 @@ +This will +be your TensorFlow model file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_48.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8988c03a4e6f202d69bf4f2e2682a5fd5dc8dd8 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_48.txt @@ -0,0 +1,3 @@ +Push the changes to your account using: + +git add . \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_49.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b29c5882912029fe8daf9df74de1e4e5db6f78 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_49.txt @@ -0,0 +1,4 @@ +git commit -m "initial commit" +git push -u origin add_tf_brand_new_bert + +Once you are satisfied, go to the webpage of your fork on GitHub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_5.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..62b71c81f51f31da80c53a8fd1950896360f3e96 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_5.txt @@ -0,0 +1,2 @@ +Writing a new model +is no small feat, but hopefully this guide will make it less of a rollercoaster 🎢 and more of a walk in the park 🚶. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_50.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75556052af8e2c70205a6902f1b28b7aa5eb5f5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_50.txt @@ -0,0 +1 @@ +Click on “Pull request”. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_51.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0b1892d3343828da4f6090d9709dfe62779eb1 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_51.txt @@ -0,0 +1,3 @@ +Make sure to add the + GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for + future changes.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_52.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..db3be93eee9eb5990852f4089e0fd533d85a1ef4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_52.txt @@ -0,0 +1 @@ +Change the PR into a draft by clicking on “Convert to draft” on the right of the GitHub pull request web page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_53.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce7cd94d26119e111d7d141c76ab9b318847757c --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_53.txt @@ -0,0 +1 @@ +Now you have set up a development environment to port BrandNewBert to TensorFlow in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_54.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_54.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_55.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb54f07b41b6d05c6fc647b9c827880ea0958716 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_55.txt @@ -0,0 +1,2 @@ +(Optional) Understand theoretical aspects and the existing implementation +You should take some time to read BrandNewBert's paper, if such descriptive work exists. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_56.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd4be4688023f1f2e3249bdb9bea1fd5c6fd6fe4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_56.txt @@ -0,0 +1,2 @@ +There might be large +sections of the paper that are difficult to understand. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_57.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb9943803b4b684b35e49675ea115a4a755b52b --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_57.txt @@ -0,0 +1 @@ +If this is the case, this is fine - don't worry! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_58.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b03ee98a9ea6eb018ac250c052b437ca5d53f32 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_58.txt @@ -0,0 +1,3 @@ +The goal is +not to get a deep theoretical understanding of the paper, but to extract the necessary information required to +effectively re-implement the model in 🤗 Transformers using TensorFlow.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_59.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd8ca70a28442f2dd4703003fd9060213554c27 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_59.txt @@ -0,0 +1,3 @@ +That being said, you don't have to spend too +much time on the theoretical aspects, but rather focus on the practical ones, namely the existing model documentation +page (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_6.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8891c88eef1f00ae93b4b0731b43ea57b6e9893 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_6.txt @@ -0,0 +1,2 @@ +Harnessing our collective experiences is absolutely critical to make this process increasingly easier, and thus we +highly encourage that you suggest improvements to this guide! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_60.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..4273a7e53f73a3eba3de56b8d96c6f0dd2313f6a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_60.txt @@ -0,0 +1 @@ +model docs for BERT). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_61.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..339a9fd947e39123c6fd823bac238a878cac693a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_61.txt @@ -0,0 +1,2 @@ +After you've grasped the basics of the models you are about to implement, it's important to understand the existing +implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_62.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..00e345c5327d0a58907691051d325823776ad6d0 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_62.txt @@ -0,0 +1,2 @@ +This is a great chance to confirm that a working implementation matches your expectations for the +model, as well as to foresee technical challenges on the TensorFlow side. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_63.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..95ab810c7e4a4edad10b50f22763a94fe7ed8e88 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_63.txt @@ -0,0 +1 @@ +It's perfectly natural that you feel overwhelmed with the amount of information that you've just absorbed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_64.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e795c308bbb6af0b180764333bdd09b339db90 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_64.txt @@ -0,0 +1,2 @@ +It is +definitely not a requirement that you understand all facets of the model at this stage. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_65.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d22be201424e2ef5afa724c0084415ab06f5ee6 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_65.txt @@ -0,0 +1,2 @@ +Nevertheless, we highly +encourage you to clear any pressing questions in our forum. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_66.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_66.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_67.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3e24f0512d350d397c8e0454fbff4db2c12f25d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_67.txt @@ -0,0 +1,2 @@ +Model implementation +Now it's time to finally start coding. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_68.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3e5514feaa37633cf2d19f88ca4df48b315c9d5 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_68.txt @@ -0,0 +1,3 @@ +Our suggested starting point is the PyTorch file itself: copy the contents of +modeling_brand_new_bert.py inside src/transformers/models/brand_new_bert/ into +modeling_tf_brand_new_bert.py. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_69.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..640135098aa5ff474bce21bb3cc339488def11a3 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_69.txt @@ -0,0 +1,3 @@ +The goal of this section is to modify the file and update the import structure of +🤗 Transformers such that you can import TFBrandNewBert and +TFBrandNewBert.from_pretrained(model_repo, from_pt=True) successfully loads a working TensorFlow BrandNewBert model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_7.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d50e067ecb153ee89eba459fa9b86f50d15db33 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_7.txt @@ -0,0 +1,6 @@ +Before you dive deeper, it is recommended that you check the following resources if you're new to 🤗 Transformers: +- General overview of 🤗 Transformers +- Hugging Face's TensorFlow Philosophy +In the remainder of this guide, you will learn what's needed to add a new TensorFlow model architecture, the +procedure to convert PyTorch into TensorFlow model weights, and how to efficiently debug mismatches across ML +frameworks. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_70.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..5891c283f03324c301e6d6856f2ef370d1d43f3d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_70.txt @@ -0,0 +1 @@ +Sadly, there is no prescription to convert a PyTorch model into TensorFlow. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_71.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..320fce569ebead266006dfd2aac1488b0cc95cdc --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_71.txt @@ -0,0 +1,3 @@ +You can, however, follow our selection of +tips to make the process as smooth as possible: +- Prepend TF to the name of all classes (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_72.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..429544a09708daeb20fa1a2105d7f39d92841732 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_72.txt @@ -0,0 +1 @@ +BrandNewBert becomes TFBrandNewBert). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_73.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b25707e4da6cd403245a2dc5a3a026a6212a27 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_73.txt @@ -0,0 +1 @@ +- Most PyTorch operations have a direct TensorFlow replacement. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_74.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..be36cfaf789b1a6e8110567127a88535e27c97fe --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_74.txt @@ -0,0 +1,2 @@ +For example, torch.nn.Linear corresponds to + tf.keras.layers.Dense, torch.nn.Dropout corresponds to tf.keras.layers.Dropout, etc. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_75.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..d943ab82e12418c8cbf97aa9c18bb30172a9437a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_75.txt @@ -0,0 +1,3 @@ +If you're not sure + about a specific operation, you can use the TensorFlow documentation + or the PyTorch documentation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_76.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7680f70747d0de34a21bdf233f8a8454205d2d --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_76.txt @@ -0,0 +1 @@ +- Look for patterns in the 🤗 Transformers codebase. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_77.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..b51d69c475ca08505b8ef0a27aa3ff32ff2e1cb7 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_77.txt @@ -0,0 +1,2 @@ +If you come across a certain operation that doesn't have a direct + replacement, the odds are that someone else already had the same problem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_78.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5b9351e084b7afd79958dd7df1243e514e1d492 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_78.txt @@ -0,0 +1 @@ +- By default, keep the same variable names and structure as in PyTorch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_79.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..b86574bc9428b228f2e68fe7ca18232e2c8c9546 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_79.txt @@ -0,0 +1,2 @@ +This will make it easier to debug, track + issues, and add fixes down the line. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_8.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3e1fe5345004122e63f7860bf6cec9f03f1d666 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_8.txt @@ -0,0 +1 @@ +Let's get started! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_80.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..858de50ccbf44a02f5025d4a17e1148d13d34fdd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_80.txt @@ -0,0 +1 @@ +- Some layers have different default values in each framework. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_81.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f596546bdd1cc848062eb6d5cb2280011b8dba4 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_81.txt @@ -0,0 +1,3 @@ +A notable example is the batch normalization layer's + epsilon (1e-5 in PyTorch + and 1e-3 in TensorFlow). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_82.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..692bdc68baac22250fe518b9f1d663f8a7abf2a2 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_82.txt @@ -0,0 +1 @@ +Double-check the documentation! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_83.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..75bf9db310e6f4b2ebafe582710e328696c7d65e --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_83.txt @@ -0,0 +1 @@ +- PyTorch's nn.Parameter variables typically need to be initialized within TF Layer's build(). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_84.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e0fa3e143ff050bb62c049e5b04987585ba7db3 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_84.txt @@ -0,0 +1,5 @@ +See the following + example: PyTorch / + TensorFlow +- If the PyTorch model has a #copied from on top of a function, the odds are that your TensorFlow model can also + borrow that function from the architecture it was copied from, assuming it has a TensorFlow architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_85.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..abc557e79fc5b2d61adcee3623c60a10d0b9b531 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_85.txt @@ -0,0 +1,2 @@ +- Assigning the name attribute correctly in TensorFlow functions is critical to do the from_pt=True weight + cross-loading. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_86.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..89774d055bf7b40de31219eaf69fc2e64db05467 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_86.txt @@ -0,0 +1 @@ +name is almost always the name of the corresponding variable in the PyTorch code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_87.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..32270da4e6231232a5702a9f1f439f609f47e94a --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_87.txt @@ -0,0 +1,2 @@ +If name is not + properly set, you will see it in the error message when loading the model weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_88.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..d711c59de32c29a9a04bab85a9283d7bd3b69fe9 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_88.txt @@ -0,0 +1,2 @@ +- The logic of the base model class, BrandNewBertModel, will actually reside in TFBrandNewBertMainLayer, a Keras + layer subclass (example). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_89.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..00daa07ab9ad808d639a95cc7001912890fd5ca3 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_89.txt @@ -0,0 +1 @@ +TFBrandNewBertModel will simply be a wrapper around this layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_9.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2b070921719b404f20414d3cc18abc991fb07a0 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_9.txt @@ -0,0 +1 @@ +Are you unsure whether the model you wish to use already has a corresponding TensorFlow architecture? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_90.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..fea6610c1a3c1d162d0fee070f2a89e6ecc4e539 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_90.txt @@ -0,0 +1 @@ +- Keras models need to be built in order to load pretrained weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_91.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..f640b836da1f14834b428604542a85b59f4b2d97 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_91.txt @@ -0,0 +1,3 @@ +For that reason, TFBrandNewBertPreTrainedModel + will need to hold an example of inputs to the model, the dummy_inputs + (example). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_92.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..de274a11b0d883dc7248af0dad7ccf6460061d30 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_92.txt @@ -0,0 +1 @@ +- If you get stuck, ask for help - we're here to help you! \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_93.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e39528a38ef713a67b6d5979cbabefd2ade0d35 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_93.txt @@ -0,0 +1,3 @@ +🤗 +In addition to the model file itself, you will also need to add the pointers to the model classes and related +documentation pages. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_94.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ce925560eba54b209b1ffd353e524c52f4934f7 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_94.txt @@ -0,0 +1,2 @@ +You can complete this part entirely following the patterns in other PRs +(example). \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_95.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..7636e39dc0e4bdd4449318de6944c49d1ff070cd --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_95.txt @@ -0,0 +1,12 @@ +Here's a list of the needed manual +changes: +- Include all public classes of BrandNewBert in src/transformers/__init__.py +- Add BrandNewBert classes to the corresponding Auto classes in src/transformers/models/auto/modeling_tf_auto.py +- Add the lazy loading classes related to BrandNewBert in src/transformers/utils/dummy_tf_objects.py +- Update the import structures for the public classes in src/transformers/models/brand_new_bert/__init__.py +- Add the documentation pointers to the public methods of BrandNewBert in docs/source/en/model_doc/brand_new_bert.md +- Add yourself to the list of contributors to BrandNewBert in docs/source/en/model_doc/brand_new_bert.md +- Finally, add a green tick ✅ to the TensorFlow column of BrandNewBert in docs/source/en/index.md +When you're happy with your implementation, run the following checklist to confirm that your model architecture is +ready: +1. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_96.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..00f64d8ee504b285abf8ce92a4bd2f7b46d1c222 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_96.txt @@ -0,0 +1 @@ +All layers that behave differently at train time (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_97.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..20972499389086bf2e3a4ee1c136acfc4d3764a7 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_97.txt @@ -0,0 +1,3 @@ +Dropout) are called with a training argument, which is +propagated all the way from the top-level classes +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_98.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..5044c6f9f51923c6f64a9720d05054be31a9fa79 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_98.txt @@ -0,0 +1,2 @@ +You have used #copied from whenever possible +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_add_tensorflow_model/chunk_99.txt b/chunked/nltk_chunking/_add_tensorflow_model/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..0dcfd78b6063ed2137a896729e8e06bfa57de6e3 --- /dev/null +++ b/chunked/nltk_chunking/_add_tensorflow_model/chunk_99.txt @@ -0,0 +1,2 @@ +TFBrandNewBertMainLayer and all classes that use it have their call function decorated with @unpack_inputs +4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_0.txt b/chunked/nltk_chunking/_attention/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..e671b08bf9fd73e7148ffad3c3fd13fca669b3a5 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_0.txt @@ -0,0 +1,3 @@ + +Attention mechanisms +Most transformer models use full attention in the sense that the attention matrix is square. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_1.txt b/chunked/nltk_chunking/_attention/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e90ca68d240e8fd99c90fb261118bd3c1007dd4 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_1.txt @@ -0,0 +1,2 @@ +It can be a big +computational bottleneck when you have long texts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_10.txt b/chunked/nltk_chunking/_attention/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad907bafe20835ee3a61aa5b296dc993745a0e93 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_10.txt @@ -0,0 +1 @@ +is enough to take action for a given token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_11.txt b/chunked/nltk_chunking/_attention/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..99693c67e975266b83f95e825830e6dd5b2fe569 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_11.txt @@ -0,0 +1,3 @@ +Also, by stacking attention layers that have a small +window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a +representation of the whole sentence. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_12.txt b/chunked/nltk_chunking/_attention/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4acffa1a68406cf55b9498bcecc9751b6df1b248 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_12.txt @@ -0,0 +1,3 @@ +Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access +all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in +their local window). \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_13.txt b/chunked/nltk_chunking/_attention/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..857d8149703dbbd57cab0bff93619f7cda5b8285 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_13.txt @@ -0,0 +1,4 @@ +This is shown in Figure 2d of the paper, see below for a sample attention mask: + +Using those attention matrices with less parameters then allows the model to have inputs having a bigger sequence +length. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_14.txt b/chunked/nltk_chunking/_attention/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..a45824410f1f6d30e0c89008aba8e67f044f085c --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_14.txt @@ -0,0 +1,5 @@ +Other tricks +Axial positional encodings +Reformer uses axial positional encodings: in traditional transformer models, the positional encoding +E is a matrix of size \(l\) by \(d\), \(l\) being the sequence length and \(d\) the dimension of the +hidden state. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_15.txt b/chunked/nltk_chunking/_attention/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b9647f01226986ba87c4f4c98bf8d8cd07ca007 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_15.txt @@ -0,0 +1 @@ +If you have very long texts, this matrix can be huge and take way too much space on the GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_16.txt b/chunked/nltk_chunking/_attention/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8ac4c60c99b81a16696add15573fda603b3cd01 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_16.txt @@ -0,0 +1,4 @@ +To alleviate +that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and E2, with +dimensions \(l_{1} \times d_{1}\) and \(l_{2} \times d_{2}\), such that \(l_{1} \times l_{2} = l\) and +\(d_{1} + d_{2} = d\) (with the product for the lengths, this ends up being way smaller). \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_17.txt b/chunked/nltk_chunking/_attention/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa542befdad14c5341d2223093127e6e000ca35 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_17.txt @@ -0,0 +1,3 @@ +The embedding for time +step \(j\) in E is obtained by concatenating the embeddings for timestep \(j \% l1\) in E1 and \(j // l1\) +in E2. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_2.txt b/chunked/nltk_chunking/_attention/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6e09e9fec5f832a72b679e55e2d68a3dcb12ef8 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_2.txt @@ -0,0 +1,2 @@ +Longformer and reformer are models that try to be more efficient and +use a sparse version of the attention matrix to speed up training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_3.txt b/chunked/nltk_chunking/_attention/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc18fba63863d9c02b28962cddce8af5a8a62c93 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_3.txt @@ -0,0 +1,2 @@ +LSH attention +Reformer uses LSH attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_4.txt b/chunked/nltk_chunking/_attention/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c33f51fbf86463da9f9c4ad2ee0091f745fd4cf --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_4.txt @@ -0,0 +1,2 @@ +In the softmax(QK^t), only the biggest elements (in the softmax +dimension) of the matrix QK^t are going to give useful contributions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_5.txt b/chunked/nltk_chunking/_attention/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..fde7cc7c4044b3904d32ff00a86dae12867eaaf5 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_5.txt @@ -0,0 +1,2 @@ +So for each query q in Q, we can consider only +the keys k in K that are close to q. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_6.txt b/chunked/nltk_chunking/_attention/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4c8b60753b0fdb544687a70b13e48ed8c4ff886 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_6.txt @@ -0,0 +1 @@ +A hash function is used to determine if q and k are close. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_7.txt b/chunked/nltk_chunking/_attention/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1604f7ae8a068d245bebbcf74deaca0c8afa048 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_7.txt @@ -0,0 +1,3 @@ +The attention mask is +modified to mask the current token (except at the first position), because it will give a query and a key equal (so +very similar to each other). \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_8.txt b/chunked/nltk_chunking/_attention/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..441d7ec1b84de8cd660bee75ac934afa0fb41f00 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_8.txt @@ -0,0 +1,2 @@ +Since the hash can be a bit random, several hash functions are used in practice +(determined by a n_rounds parameter) and then are averaged together. \ No newline at end of file diff --git a/chunked/nltk_chunking/_attention/chunk_9.txt b/chunked/nltk_chunking/_attention/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bb75c6010096060aecdca467b46e22ddface1e9 --- /dev/null +++ b/chunked/nltk_chunking/_attention/chunk_9.txt @@ -0,0 +1,3 @@ +Local attention +Longformer uses local attention: often, the local context (e.g., what are the two tokens to the +left and right?) 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_0.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b7e744fe93360bd70003d704ae3744b4dca76cb --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_0.txt @@ -0,0 +1,3 @@ + +Load pretrained instances with an AutoClass +With so many different Transformer architectures, it can be challenging to create one for your checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_1.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e677d973d459836b4409deee30bafb537eb3161 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_1.txt @@ -0,0 +1 @@ +As a part of 🤗 Transformers core philosophy to make the library easy, simple and flexible to use, an AutoClass automatically infers and loads the correct architecture from a given checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_10.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb85defdc44c5207ccdbfbd1c9a5b61fde83156d --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_10.txt @@ -0,0 +1 @@ +Load a pretrained model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_11.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae62aee56db63223002af73d73a4df39242bfc89 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_11.txt @@ -0,0 +1 @@ +Load a model as a backbone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_12.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..3992a614ef8e33902ce76d66109602befee49eef --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_12.txt @@ -0,0 +1,2 @@ +AutoTokenizer +Nearly every NLP task begins with a tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_13.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..f43cf502f5fbca13d1ec99b89a83803339916ed2 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_13.txt @@ -0,0 +1 @@ +A tokenizer converts your input into a format that can be processed by the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_14.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0961b799f1641532db601990962543127eb5367d --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_14.txt @@ -0,0 +1,8 @@ +Load a tokenizer with [AutoTokenizer.from_pretrained]: + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + +Then tokenize your input as shown below: + +sequence = "In a hole in the ground there lived a hobbit." 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_15.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..c61a648b29e77d369c8e056b01c79d951b4e08bf --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_15.txt @@ -0,0 +1,7 @@ +print(tokenizer(sequence)) +{'input_ids': [101, 1999, 1037, 4920, 1999, 1996, 2598, 2045, 2973, 1037, 7570, 10322, 4183, 1012, 102], + 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]} + +AutoImageProcessor +For vision tasks, an image processor processes the image into the correct input format. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_16.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c8d054bf6cf44dd1f17284682dbed3f6edf0418 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_16.txt @@ -0,0 +1,6 @@ +from transformers import AutoImageProcessor +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") + +AutoBackbone + +A Swin backbone with multiple stages for outputting a feature map. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_17.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9403e2af42d127f3a6b639b7d16d460dec5bc57 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_17.txt @@ -0,0 +1 @@ +The [AutoBackbone] lets you use pretrained models as backbones to get feature maps from different stages of the backbone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_18.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4d860420efc1c51baa993b4104739d20fbd6d4b --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_18.txt @@ -0,0 +1,6 @@ +You should specify one of the following parameters in [~PretrainedConfig.from_pretrained]: + +out_indices is the index of the layer you'd like to get the feature map from +out_features is the name of the layer you'd like to get the feature map from + +These parameters can be used interchangeably, but if you use both, make sure they're aligned with each other! \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_19.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe14065b91b53fd4ffd2a4151da9437f5f3c69fe --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_19.txt @@ -0,0 +1 @@ +If you don't pass any of these parameters, the backbone returns the feature map from the last layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_2.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..bba4b3c2269bafc72e47e5740281e9a39545da9b --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_2.txt @@ -0,0 +1 @@ +The from_pretrained() method lets you quickly load a pretrained model for any architecture so you don't have to devote time and resources to train a model from scratch. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_20.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..83fd4403602296f0145b0ac99a262fcedb602ce3 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_20.txt @@ -0,0 +1 @@ +A feature map from the first stage of the backbone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_21.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..64a8b9d28bb2ef1d4882b64b9c5b942785ed801d --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_21.txt @@ -0,0 +1 @@ +The patch partition refers to the model stem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_22.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1ab6209a26c53f0e5367e47c378e7ed18795a83 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_22.txt @@ -0,0 +1,21 @@ +For example, in the above diagram, to return the feature map from the first stage of the Swin backbone, you can set out_indices=(1,): + +from transformers import AutoImageProcessor, AutoBackbone +import torch +from PIL import Image +import requests +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image = Image.open(requests.get(url, stream=True).raw) +processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224") +model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,)) +inputs = processor(image, return_tensors="pt") +outputs = model(**inputs) +feature_maps = outputs.feature_maps + +Now you can access the feature_maps object from the first stage of the backbone: + +list(feature_maps[0].shape) +[1, 96, 56, 56] + +AutoFeatureExtractor +For audio tasks, a feature extractor processes the audio signal the correct input format. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_23.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..24186810a73f24c541f6153d611a7799e594886d --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_23.txt @@ -0,0 +1,9 @@ +Load a feature extractor with [AutoFeatureExtractor.from_pretrained]: + +from transformers import AutoFeatureExtractor +feature_extractor = AutoFeatureExtractor.from_pretrained( + "ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition" + ) + +AutoProcessor +Multimodal tasks require a processor that combines two types of preprocessing tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_24.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..98881320095453132a81110693653f23a5b56280 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_24.txt @@ -0,0 +1 @@ +For example, the LayoutLMV2 model requires an image processor to handle images and a tokenizer to handle text; a processor combines both of them. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_25.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f22fe7178f5505a73549c427a03feacd3701fa9d --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_25.txt @@ -0,0 +1,8 @@ +Load a processor with [AutoProcessor.from_pretrained]: + +from transformers import AutoProcessor +processor = AutoProcessor.from_pretrained("microsoft/layoutlmv2-base-uncased") + +AutoModel + +The AutoModelFor classes let you load a pretrained model for a given task (see here for a complete list of available tasks). \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_26.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f19848c1a29b4cdee1dc3f35e1aada85956468 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_26.txt @@ -0,0 +1,11 @@ +For example, load a model for sequence classification with [AutoModelForSequenceClassification.from_pretrained]: + +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse the same checkpoint to load an architecture for a different task: + +from transformers import AutoModelForTokenClassification +model = AutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") + +For PyTorch models, the from_pretrained() method uses torch.load() which internally uses pickle and is known to be insecure. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_27.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d238751152d91f47dac151dac30f570f8049a391 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_27.txt @@ -0,0 +1 @@ +In general, never load a model that could have come from an untrusted source, or that could have been tampered with. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_28.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f3aeffa7647b546d20114cb6bd9389dcf240450a --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_28.txt @@ -0,0 +1 @@ +This security risk is partially mitigated for public models hosted on the Hugging Face Hub, which are scanned for malware at each commit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_29.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b621b870ee24027af707349b1b84f98ac5df918e --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_29.txt @@ -0,0 +1 @@ +See the Hub documentation for best practices like signed commit verification with GPG. 
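A hedged mitigation sketch related to the pickle warning above: prefer the safetensors serialization when the repository provides it. Whether a given checkpoint actually ships .safetensors files is an assumption you should verify on the Hub.

from transformers import AutoModelForSequenceClassification

# use_safetensors=True loads the safetensors weights, which avoid pickle entirely;
# this raises an error if the repository only contains pytorch_model.bin files.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", use_safetensors=True
)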
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_3.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bb771fa5e14e39c9e8c9435d4d210a49e2021e2 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_3.txt @@ -0,0 +1 @@ +Producing this type of checkpoint-agnostic code means if your code works for one checkpoint, it will work with another checkpoint - as long as it was trained for a similar task - even if the architecture is different. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_30.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..d627d595f168c06f33a8e6b933744f277074d1c2 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_30.txt @@ -0,0 +1 @@ +TensorFlow and Flax checkpoints are not affected, and can be loaded within PyTorch architectures using the from_tf and from_flax kwargs for the from_pretrained method to circumvent this issue. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_31.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..af09b10999348dada07ce540eaf0970f398efcc5 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_31.txt @@ -0,0 +1 @@ +Generally, we recommend using the AutoTokenizer class and the AutoModelFor class to load pretrained instances of models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_32.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..828720267f6af0e92162369230eb8e6ae51285a6 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_32.txt @@ -0,0 +1 @@ +This will ensure you load the correct architecture every time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_33.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1356b63f0523b8a7152097df37caf3717af89282 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_33.txt @@ -0,0 +1 @@ +In the next tutorial, learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_34.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d05e5f94e7b75d50f1922ffffb8ca4e3b58ed8a --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_34.txt @@ -0,0 +1 @@ +Finally, the TFAutoModelFor classes let you load a pretrained model for a given task (see here for a complete list of available tasks). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_35.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f3a1ad0b2f9502e0a1c3a75e569d88f7642221 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_35.txt @@ -0,0 +1,11 @@ +For example, load a model for sequence classification with [TFAutoModelForSequenceClassification.from_pretrained]: + +from transformers import TFAutoModelForSequenceClassification +model = TFAutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse the same checkpoint to load an architecture for a different task: + +from transformers import TFAutoModelForTokenClassification +model = TFAutoModelForTokenClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Generally, we recommend using the AutoTokenizer class and the TFAutoModelFor class to load pretrained instances of models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_36.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..828720267f6af0e92162369230eb8e6ae51285a6 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_36.txt @@ -0,0 +1 @@ +This will ensure you load the correct architecture every time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_37.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1356b63f0523b8a7152097df37caf3717af89282 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_37.txt @@ -0,0 +1 @@ +In the next tutorial, learn how to use your newly loaded tokenizer, image processor, feature extractor and processor to preprocess a dataset for fine-tuning. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_4.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..efb2b46d8cc870416afad6a3f1c4daed0565c014 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_4.txt @@ -0,0 +1 @@ +Remember, architecture refers to the skeleton of the model and checkpoints are the weights for a given architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_5.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a0fdb3be10e71e0899415c3d7fce1cdc2eb6642 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_5.txt @@ -0,0 +1 @@ +For example, BERT is an architecture, while google-bert/bert-base-uncased is a checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_6.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..1509870cdc594ed3f095d86dc3e81cd58f42e4a8 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_6.txt @@ -0,0 +1 @@ +Model is a general term that can mean either architecture or checkpoint. 
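As a minimal sketch of the from_tf escape hatch mentioned a few chunks above (it assumes the repository actually contains a TensorFlow tf_model.h5 checkpoint and that TensorFlow is installed):

from transformers import AutoModelForSequenceClassification

# Load TensorFlow weights into the PyTorch architecture, then re-save them natively.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", from_tf=True
)
model.save_pretrained("./distilbert-pt")  # subsequent loads no longer need from_tf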
\ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_7.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..631c7acd75a7585f978be7f137bcc1c4e5365ef0 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_7.txt @@ -0,0 +1,3 @@ +In this tutorial, learn to: + +Load a pretrained tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_8.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..e782c58d81a967446664d1a641319e8e7531d6b8 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_8.txt @@ -0,0 +1,2 @@ +Load a pretrained image processor +Load a pretrained feature extractor. \ No newline at end of file diff --git a/chunked/nltk_chunking/_autoclass_tutorial/chunk_9.txt b/chunked/nltk_chunking/_autoclass_tutorial/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfc650493b27bc4465221b782faa8c1b55778b07 --- /dev/null +++ b/chunked/nltk_chunking/_autoclass_tutorial/chunk_9.txt @@ -0,0 +1 @@ +Load a pretrained processor. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_0.txt b/chunked/nltk_chunking/_benchmarks/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1cd1728a024b80509bf33fe1398979eeada79ee --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_0.txt @@ -0,0 +1,5 @@ + +Benchmarks + +Hugging Face's Benchmarking tools are deprecated and it is advised to use external Benchmarking libraries to measure the speed +and memory complexity of Transformer models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_1.txt b/chunked/nltk_chunking/_benchmarks/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..45394ba547395d128ca565d078a1007864a6dbb3 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_1.txt @@ -0,0 +1,2 @@ +[[open-in-colab]] +Let's take a look at how 🤗 Transformers models can be benchmarked, best practices, and already available benchmarks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_10.txt b/chunked/nltk_chunking/_benchmarks/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfb797c576ea4f8e2555001e028d9e0a18f79477 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_10.txt @@ -0,0 +1,3 @@ +The argument models is required and expects a list of model identifiers from the +model hub The list arguments batch_sizes and sequence_lengths define +the size of the input_ids on which the model is benchmarked. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_11.txt b/chunked/nltk_chunking/_benchmarks/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..eaf551c76fa2facc5287a38a3775e39440816154 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_11.txt @@ -0,0 +1,2 @@ +There are many more parameters that can be configured +via the benchmark argument data classes. 
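For instance, a hedged sketch of turning on a couple of those extra options; fp16 and training are my reading of the argument data classes, so confirm the exact names against the --help output referenced below.

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["google-bert/bert-base-uncased"],
    batch_sizes=[8],
    sequence_lengths=[8, 32],
    fp16=True,      # assumed flag: benchmark in half precision
    training=True,  # assumed flag: also measure a forward + backward pass
)
benchmark = PyTorchBenchmark(args)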
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_12.txt b/chunked/nltk_chunking/_benchmarks/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bf79525da625e64331c88e2ed26ae90aeb1394f --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_12.txt @@ -0,0 +1,3 @@ +For more detail on these one can either directly consult the files +src/transformers/benchmark/benchmark_args_utils.py, src/transformers/benchmark/benchmark_args.py (for PyTorch) +and src/transformers/benchmark/benchmark_args_tf.py (for Tensorflow). \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_13.txt b/chunked/nltk_chunking/_benchmarks/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..deebdc19ebd56d52260f8f047f7345775ad180b2 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_13.txt @@ -0,0 +1,3 @@ +Alternatively, running the following shell +commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow +respectively. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_14.txt b/chunked/nltk_chunking/_benchmarks/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..c702e39a890590542ebab55f4227faf9d04bdd03 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_14.txt @@ -0,0 +1,2 @@ +python examples/pytorch/benchmarking/run_benchmark.py --help +An instantiated benchmark object can then simply be run by calling benchmark.run(). \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_15.txt b/chunked/nltk_chunking/_benchmarks/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6624891a990c4cdd42997cc37b111e2ffecdbe65 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_15.txt @@ -0,0 +1,45 @@ +results = benchmark.run() +print(results) +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +google-bert/bert-base-uncased 8 8 0.006 +google-bert/bert-base-uncased 8 32 0.006 +google-bert/bert-base-uncased 8 128 0.018 +google-bert/bert-base-uncased 8 512 0.088 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +google-bert/bert-base-uncased 8 8 1227 +google-bert/bert-base-uncased 8 32 1281 +google-bert/bert-base-uncased 8 128 1307 +google-bert/bert-base-uncased 8 512 1539 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: PyTorch +use_torchscript: False +framework_version: 1.4.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 08:58:43.371351 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +bash +python examples/tensorflow/benchmarking/run_benchmark_tf.py --help + +An instantiated benchmark object can then simply be run by calling benchmark.run(). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_16.txt b/chunked/nltk_chunking/_benchmarks/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a025746209c543024ac41f2ac8d130ef3c9546f --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_16.txt @@ -0,0 +1,44 @@ +results = benchmark.run() +print(results) +results = benchmark.run() +print(results) +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +google-bert/bert-base-uncased 8 8 0.005 +google-bert/bert-base-uncased 8 32 0.008 +google-bert/bert-base-uncased 8 128 0.022 +google-bert/bert-base-uncased 8 512 0.105 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +google-bert/bert-base-uncased 8 8 1330 +google-bert/bert-base-uncased 8 32 1330 +google-bert/bert-base-uncased 8 128 1330 +google-bert/bert-base-uncased 8 512 1770 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: Tensorflow +use_xla: False +framework_version: 2.2.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:26:35.617317 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +By default, the time and the required memory for inference are benchmarked. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_17.txt b/chunked/nltk_chunking/_benchmarks/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d1c74afd87d993853915c9ba9ef1e9fc9021d57 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_17.txt @@ -0,0 +1,2 @@ +In the example output above the first +two sections show the result corresponding to inference time and inference memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_18.txt b/chunked/nltk_chunking/_benchmarks/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fa90b2da730d6a8b98e912d3c9d63c17cb47573 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_18.txt @@ -0,0 +1,2 @@ +In addition, all relevant +information about the computing environment, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_19.txt b/chunked/nltk_chunking/_benchmarks/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1ae357509a583442d6a6d27817ef8b145f5ab62 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_19.txt @@ -0,0 +1,2 @@ +the GPU type, the system, the library versions, etc are printed +out in the third section under ENVIRONMENT INFORMATION. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_2.txt b/chunked/nltk_chunking/_benchmarks/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..65f9e24a6065ee73d9275e1f5c6d100590221fd2 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_2.txt @@ -0,0 +1 @@ +A notebook explaining in more detail how to benchmark 🤗 Transformers models can be found here. 
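Because the built-in tools are deprecated (as noted at the start of this section), a rough external alternative is to time a forward pass directly. The sketch below uses torch.utils.benchmark, and the batch size and sequence length are arbitrary assumptions.

import torch
from torch.utils import benchmark as torch_benchmark
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained("google-bert/bert-base-uncased").eval()
inputs = tokenizer(["hello"] * 8, padding="max_length", max_length=128, return_tensors="pt")

timer = torch_benchmark.Timer(
    stmt="with torch.no_grad(): model(**inputs)",
    globals={"model": model, "inputs": inputs, "torch": torch},
)
print(timer.timeit(10))  # average runtime over 10 forward passes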
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_20.txt b/chunked/nltk_chunking/_benchmarks/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca40f887676ec7d4963640db1ded9b2ff13d26c1 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_20.txt @@ -0,0 +1,3 @@ +This information can optionally be saved in a .csv file +when adding the argument save_to_csv=True to [PyTorchBenchmarkArguments] and +[TensorFlowBenchmarkArguments] respectively. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_21.txt b/chunked/nltk_chunking/_benchmarks/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..176d5ac743d279a680d16d6e49bc23c7a9d64e8c --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_21.txt @@ -0,0 +1,2 @@ +In this case, every section is saved in a separate +.csv file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_22.txt b/chunked/nltk_chunking/_benchmarks/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa31670521080b114c392bf6ce2193a23f721add --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_22.txt @@ -0,0 +1 @@ +The path to each .csv file can optionally be defined via the argument data classes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_23.txt b/chunked/nltk_chunking/_benchmarks/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb1fec35ecf1e619e022bff74e6669c40423e889 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_23.txt @@ -0,0 +1 @@ +Instead of benchmarking pre-trained models via their model identifier, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_24.txt b/chunked/nltk_chunking/_benchmarks/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3303d7c89b7dbde7b9cf06ff02e08f4f46088fae --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_24.txt @@ -0,0 +1,2 @@ +google-bert/bert-base-uncased, the user can +alternatively benchmark an arbitrary configuration of any available model class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_25.txt b/chunked/nltk_chunking/_benchmarks/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3ef13ab205ed8ccfd60368bc9ef728f0e663f68 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_25.txt @@ -0,0 +1,2 @@ +In this case, a list of +configurations must be inserted with the benchmark args as follows. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_26.txt b/chunked/nltk_chunking/_benchmarks/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9c71f5f1176501b41692fbc3b48f80f6dff3b53 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_26.txt @@ -0,0 +1,133 @@ +from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig +args = PyTorchBenchmarkArguments( + models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] + ) +config_base = BertConfig() +config_384_hid = BertConfig(hidden_size=384) +config_6_lay = BertConfig(num_hidden_layers=6) +benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +bert-base 8 128 0.006 +bert-base 8 512 0.006 +bert-base 8 128 0.018 +bert-base 8 512 0.088 +bert-384-hid 8 8 0.006 +bert-384-hid 8 32 0.006 +bert-384-hid 8 128 0.011 +bert-384-hid 8 512 0.054 +bert-6-lay 8 8 0.003 +bert-6-lay 8 32 0.004 +bert-6-lay 8 128 0.009 +bert-6-lay 8 512 0.044 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +bert-base 8 8 1277 +bert-base 8 32 1281 +bert-base 8 128 1307 +bert-base 8 512 1539 +bert-384-hid 8 8 1005 +bert-384-hid 8 32 1027 +bert-384-hid 8 128 1035 +bert-384-hid 8 512 1255 +bert-6-lay 8 8 1097 +bert-6-lay 8 32 1101 +bert-6-lay 8 128 1127 +bert-6-lay 8 512 1359 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: PyTorch +use_torchscript: False +framework_version: 1.4.0 +python_version: 3.6.10 +system: Linux +cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:35:25.143267 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +py + +from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig + +args = TensorFlowBenchmarkArguments( + models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] + ) +config_base = BertConfig() +config_384_hid = BertConfig(hidden_size=384) +config_6_lay = BertConfig(num_hidden_layers=6) +benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay]) +benchmark.run() +==================== INFERENCE - SPEED - RESULT ==================== + +Model Name Batch Size Seq Length Time in s +bert-base 8 8 0.005 +bert-base 8 32 0.008 +bert-base 8 128 0.022 +bert-base 8 512 0.106 +bert-384-hid 8 8 0.005 +bert-384-hid 8 32 0.007 +bert-384-hid 8 128 0.018 +bert-384-hid 8 512 0.064 +bert-6-lay 8 8 0.002 +bert-6-lay 8 32 0.003 +bert-6-lay 8 128 0.0011 +bert-6-lay 8 512 0.074 + +==================== INFERENCE - MEMORY - RESULT ==================== +Model Name Batch Size Seq Length Memory in MB +bert-base 8 8 1330 +bert-base 8 32 1330 +bert-base 8 128 1330 +bert-base 8 512 1770 +bert-384-hid 8 8 1330 +bert-384-hid 8 32 1330 +bert-384-hid 8 128 1330 +bert-384-hid 8 512 1540 +bert-6-lay 8 8 1330 +bert-6-lay 8 32 1330 +bert-6-lay 8 128 1330 +bert-6-lay 8 512 1540 + +==================== ENVIRONMENT INFORMATION ==================== + +transformers_version: 2.11.0 +framework: Tensorflow +use_xla: False +framework_version: 2.2.0 +python_version: 3.6.10 +system: Linux 
+cpu: x86_64 +architecture: 64bit +date: 2020-06-29 +time: 09:38:15.487125 +fp16: False +use_multiprocessing: True +only_pretrain_model: False +cpu_ram_mb: 32088 +use_gpu: True +num_gpus: 1 +gpu: TITAN RTX +gpu_ram_mb: 24217 +gpu_power_watts: 280.0 +gpu_performance_state: 2 +use_tpu: False + +Again, inference time and required memory for inference are measured, but this time for customized configurations +of the BertModel class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_27.txt b/chunked/nltk_chunking/_benchmarks/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca087fd2ee84b11dd01016313f2c5fd0b798a82e --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_27.txt @@ -0,0 +1,2 @@ +This feature can especially be helpful when deciding for which configuration the model +should be trained. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_28.txt b/chunked/nltk_chunking/_benchmarks/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..158e3739f0bffd36dd0d6e0cd51d3abb39082408 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_28.txt @@ -0,0 +1,2 @@ +Benchmark best practices +This section lists a couple of best practices one should be aware of when benchmarking a model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_29.txt b/chunked/nltk_chunking/_benchmarks/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c12848ec134b153883880fcc55f1e5afa3af6a1 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_29.txt @@ -0,0 +1 @@ +Currently, only single device benchmarking is supported. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_3.txt b/chunked/nltk_chunking/_benchmarks/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5a8c7135bde5652488725f739428338cd4c0696 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_3.txt @@ -0,0 +1,2 @@ +How to benchmark 🤗 Transformers models +The classes [PyTorchBenchmark] and [TensorFlowBenchmark] allow to flexibly benchmark 🤗 Transformers models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_30.txt b/chunked/nltk_chunking/_benchmarks/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..910a3c7a8729ed1f88ef39aff6458e376490e26f --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_30.txt @@ -0,0 +1,3 @@ +When benchmarking on GPU, it is recommended that the user + specifies on which device the code should be run by setting the CUDA_VISIBLE_DEVICES environment variable in the + shell, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_31.txt b/chunked/nltk_chunking/_benchmarks/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec0e80c1b85972171508a36374f8a8d7abb74f4c --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_31.txt @@ -0,0 +1 @@ +export CUDA_VISIBLE_DEVICES=0 before running the code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_32.txt b/chunked/nltk_chunking/_benchmarks/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..1985650a972923322b9f95ac73bdda9ce5739dab --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_32.txt @@ -0,0 +1 @@ +The option no_multi_processing should only be set to True for testing and debugging. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_33.txt b/chunked/nltk_chunking/_benchmarks/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..cad300a9231cdd48f6fd9bba8216cb876b6245b6 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_33.txt @@ -0,0 +1,3 @@ +To ensure accurate + memory measurement it is recommended to run each memory benchmark in a separate process by making sure + no_multi_processing is set to True. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_34.txt b/chunked/nltk_chunking/_benchmarks/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4b4559962f69e5acb21864349986b911e0b37fb --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_34.txt @@ -0,0 +1 @@ +One should always state the environment information when sharing the results of a model benchmark. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_35.txt b/chunked/nltk_chunking/_benchmarks/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad4194b05d120ef8b0625c0f28cefaa19e3bd43a --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_35.txt @@ -0,0 +1,3 @@ +Results can vary + heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very + useful for the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_36.txt b/chunked/nltk_chunking/_benchmarks/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3d83ff6d1dd256a7449880caabad6ca3b04acff --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_36.txt @@ -0,0 +1,3 @@ +Sharing your benchmark +Previously all available core models (10 at the time) have been benchmarked for inference time, across many different +settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_37.txt b/chunked/nltk_chunking/_benchmarks/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..62ad659ab08a20da2f5132d3d45c9aa580abd102 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_37.txt @@ -0,0 +1,2 @@ +All of those tests were +done across CPUs (except for TensorFlow XLA) and GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_38.txt b/chunked/nltk_chunking/_benchmarks/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a5e3f33d47dbb5996f7cbbac6f7251ae87a3d63 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_38.txt @@ -0,0 +1,2 @@ +The approach is detailed in the following blogpost and the results are +available here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_39.txt b/chunked/nltk_chunking/_benchmarks/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..9856375638a9cd3a3fcd225a21d5eda572e48cbb --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_39.txt @@ -0,0 +1,3 @@ +With the new benchmark tools, it is easier than ever to share your benchmark results with the community + +PyTorch Benchmarking Results. 
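A combined sketch of two of the practices above: pinning the benchmark to a single GPU and exporting the results to .csv files for sharing. The csv filename arguments are assumptions about the argument data classes; check the --help output for the exact names in your version.

import os

# Pin the run to one GPU before anything CUDA-related is initialized.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments

args = PyTorchBenchmarkArguments(
    models=["google-bert/bert-base-uncased"],
    batch_sizes=[8],
    sequence_lengths=[8, 32],
    save_to_csv=True,
    inference_time_csv_file="inference_time.csv",      # assumed argument name
    inference_memory_csv_file="inference_memory.csv",  # assumed argument name
    env_info_csv_file="env_info.csv",                  # assumed argument name
)
benchmark = PyTorchBenchmark(args)
results = benchmark.run()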
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_4.txt b/chunked/nltk_chunking/_benchmarks/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a6c9602ff7ce7cf9e97f56906268ed814fb53da --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_4.txt @@ -0,0 +1 @@ +The benchmark classes allow us to measure the peak memory usage and required time for both inference and training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_40.txt b/chunked/nltk_chunking/_benchmarks/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59e42fe26b7b2e616dd25c265bbb5027e49de98 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_40.txt @@ -0,0 +1 @@ +TensorFlow Benchmarking Results. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_5.txt b/chunked/nltk_chunking/_benchmarks/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d7cd1293a51a63cc655d077d4ebd980c8dad7bf --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_5.txt @@ -0,0 +1,2 @@ +Hereby, inference is defined by a single forward pass, and training is defined by a single forward pass and +backward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_6.txt b/chunked/nltk_chunking/_benchmarks/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e7c39da1277ef321980804fa98f988465a08dd5 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_6.txt @@ -0,0 +1,2 @@ +The benchmark classes [PyTorchBenchmark] and [TensorFlowBenchmark] expect an object of type [PyTorchBenchmarkArguments] and +[TensorFlowBenchmarkArguments], respectively, for instantiation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_7.txt b/chunked/nltk_chunking/_benchmarks/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d17b3c0f76e73a2e10c88c59107f9f37f92ea61f --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_7.txt @@ -0,0 +1 @@ +[PyTorchBenchmarkArguments] and [TensorFlowBenchmarkArguments] are data classes and contain all relevant configurations for their corresponding benchmark class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_8.txt b/chunked/nltk_chunking/_benchmarks/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e379b0070d9f7b48475a377755cf98eb7ed9135 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_8.txt @@ -0,0 +1 @@ +In the following example, it is shown how a BERT model of type bert-base-cased can be benchmarked. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_benchmarks/chunk_9.txt b/chunked/nltk_chunking/_benchmarks/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c05f0cc8466bd75a5e073f3565091caf9278de24 --- /dev/null +++ b/chunked/nltk_chunking/_benchmarks/chunk_9.txt @@ -0,0 +1,13 @@ +from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments +args = PyTorchBenchmarkArguments(models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512]) +benchmark = PyTorchBenchmark(args) + +py +from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments +args = TensorFlowBenchmarkArguments( + models=["google-bert/bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512] + ) +benchmark = TensorFlowBenchmark(args) + +Here, three arguments are given to the benchmark argument data classes, namely models, batch_sizes, and +sequence_lengths. \ No newline at end of file diff --git a/chunked/nltk_chunking/_bertology/chunk_0.txt b/chunked/nltk_chunking/_bertology/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..74e4265ba72d3a67f6ca400d76e66b021e98a2b2 --- /dev/null +++ b/chunked/nltk_chunking/_bertology/chunk_0.txt @@ -0,0 +1,4 @@ + +BERTology +There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT +(that some call "BERTology"). \ No newline at end of file diff --git a/chunked/nltk_chunking/_bertology/chunk_1.txt b/chunked/nltk_chunking/_bertology/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d53420de2e7d83eec0c3260d148a2f1bf54b57b --- /dev/null +++ b/chunked/nltk_chunking/_bertology/chunk_1.txt @@ -0,0 +1,5 @@ +Some good examples of this field are: + +BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: + https://arxiv.org/abs/1905.05950 +Are Sixteen Heads Really Better than One? \ No newline at end of file diff --git a/chunked/nltk_chunking/_bertology/chunk_2.txt b/chunked/nltk_chunking/_bertology/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbc0e0a94f2a026a5d52568af4fd9d199246feac --- /dev/null +++ b/chunked/nltk_chunking/_bertology/chunk_2.txt @@ -0,0 +1,2 @@ +by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650 +What Does BERT Look At? \ No newline at end of file diff --git a/chunked/nltk_chunking/_bertology/chunk_3.txt b/chunked/nltk_chunking/_bertology/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0c66c3d9e32ecac7c0eff0e649d09e4dc1dc66c --- /dev/null +++ b/chunked/nltk_chunking/_bertology/chunk_3.txt @@ -0,0 +1,12 @@ +An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. + Manning: https://arxiv.org/abs/1906.04341 +CAT-probing: A Metric-based Approach to Interpret How Pre-trained Models for Programming Language Attend Code Structure: https://arxiv.org/abs/2210.04633 + +In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to +help people access the inner representations, mainly adapted from the great work of Paul Michel +(https://arxiv.org/abs/1905.10650): + +accessing all the hidden-states of BERT/GPT/GPT-2, +accessing all the attention weights for each head of BERT/GPT/GPT-2, +retrieving heads output values and gradients to be able to compute head importance score and prune head as explained + in https://arxiv.org/abs/1905.10650. 
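As a short illustration of those hooks through the standard model call (this is the generic output_hidden_states / output_attentions interface, not the bertology.py script discussed next):

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
model = AutoModel.from_pretrained(
    "google-bert/bert-base-uncased", output_hidden_states=True, output_attentions=True
)
inputs = tokenizer("BERTology looks inside the model.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
print(len(outputs.hidden_states))   # embedding output + one hidden state per layer
print(outputs.attentions[0].shape)  # (batch, num_heads, seq_len, seq_len) for the first layer

# Head pruning is exposed as well, e.g. removing heads 1 and 2 of layer 0:
model.prune_heads({0: [1, 2]})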
\ No newline at end of file diff --git a/chunked/nltk_chunking/_bertology/chunk_4.txt b/chunked/nltk_chunking/_bertology/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..f265fbff154f980ea4b6c5d88136d78324bf867a --- /dev/null +++ b/chunked/nltk_chunking/_bertology/chunk_4.txt @@ -0,0 +1,2 @@ +To help you understand and use these features, we have added a specific example script: bertology.py while extract information and prune a model pre-trained on +GLUE. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_0.txt b/chunked/nltk_chunking/_big_models/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8d0c25d9b5310c401a65d44c4eeab2999317157 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_0.txt @@ -0,0 +1,3 @@ + +Instantiating a big model +When you want to use a very big pretrained model, one challenge is to minimize the use of the RAM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_1.txt b/chunked/nltk_chunking/_big_models/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bc0731f4d3785c6a172640a6d4ade98578ded71 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_1.txt @@ -0,0 +1,4 @@ +The usual workflow +from PyTorch is: + +Create your model with random weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_10.txt b/chunked/nltk_chunking/_big_models/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..7af41af1f564d224baae1d815f382effe49da055 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_10.txt @@ -0,0 +1,2 @@ +Sharded checkpoints +Since version 4.18.0, model checkpoints that end up taking more than 10GB of space are automatically sharded in smaller pieces. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_11.txt b/chunked/nltk_chunking/_big_models/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d88be6e29ccc395d0e924df7de9030f6a3c3e5a --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_11.txt @@ -0,0 +1 @@ +In terms of having one single checkpoint when you do model.save_pretrained(save_dir), you will end up with several partial checkpoints (each of which being of size < 10GB) and an index that maps parameter names to the files they are stored in. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_12.txt b/chunked/nltk_chunking/_big_models/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e53565779206d7ce99a9528429b7ec6e4416ba --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_12.txt @@ -0,0 +1 @@ +You can control the maximum size before sharding with the max_shard_size parameter, so for the sake of an example, we'll use a normal-size models with a small shard size: let's take a traditional BERT model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_13.txt b/chunked/nltk_chunking/_big_models/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..279adf22bf4b69262e42d0a3a910ddbacca2afd0 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_13.txt @@ -0,0 +1,20 @@ +from transformers import AutoModel +model = AutoModel.from_pretrained("google-bert/bert-base-cased") + +If you save it using [~PreTrainedModel.save_pretrained], you will get a new folder with two files: the config of the model and its weights: + +import os +import tempfile +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model.bin'] + +Now let's use a maximum shard size of 200MB: + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + print(sorted(os.listdir(tmp_dir))) +['config.json', 'pytorch_model-00001-of-00003.bin', 'pytorch_model-00002-of-00003.bin', 'pytorch_model-00003-of-00003.bin', 'pytorch_model.bin.index.json'] + +On top of the configuration of the model, we see three different weights files, and an index.json file which is our index. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_14.txt b/chunked/nltk_chunking/_big_models/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c7a38e87e6d655fd2026badd3504c8612e4943f --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_14.txt @@ -0,0 +1,7 @@ +A checkpoint like this can be fully reloaded using the [~PreTrainedModel.from_pretrained] method: + +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + new_model = AutoModel.from_pretrained(tmp_dir) + +The main advantage of doing this for big models is that during step 2 of the workflow shown above, each shard of the checkpoint is loaded after the previous one, capping the memory usage in RAM to the model size plus the size of the biggest shard. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_15.txt b/chunked/nltk_chunking/_big_models/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..956cc5afd62cbfb6cf25a10397d57e2741a16051 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_15.txt @@ -0,0 +1 @@ +Behind the scenes, the index file is used to determine which keys are in the checkpoint, and where the corresponding weights are stored. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_16.txt b/chunked/nltk_chunking/_big_models/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..212b0e52c4b7a78161975b2a0fc040c3426d5412 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_16.txt @@ -0,0 +1,11 @@ +We can load that index like any json and get a dictionary: + +import json +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + with open(os.path.join(tmp_dir, "pytorch_model.bin.index.json"), "r") as f: + index = json.load(f) +print(index.keys()) +dict_keys(['metadata', 'weight_map']) + +The metadata just consists of the total size of the model for now. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_17.txt b/chunked/nltk_chunking/_big_models/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be64db07863276b0e842a5e17835db38da37b244 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_17.txt @@ -0,0 +1,21 @@ +We plan to add other information in the future: + +index["metadata"] +{'total_size': 433245184} + +The weights map is the main part of this index, which maps each parameter name (as usually found in a PyTorch model state_dict) to the file it's stored in: + +index["weight_map"] +{'embeddings.LayerNorm.bias': 'pytorch_model-00001-of-00003.bin', + 'embeddings.LayerNorm.weight': 'pytorch_model-00001-of-00003.bin', + + +If you want to directly load such a sharded checkpoint inside a model without using [~PreTrainedModel.from_pretrained] (like you would do model.load_state_dict() for a full checkpoint) you should use [~modeling_utils.load_sharded_checkpoint]: + +from transformers.modeling_utils import load_sharded_checkpoint +with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir, max_shard_size="200MB") + load_sharded_checkpoint(model, tmp_dir) + +Low memory loading +Sharded checkpoints reduce the memory usage during step 2 of the workflow mentioned above, but in order to use that model in a low memory setting, we recommend leveraging our tools based on the Accelerate library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_18.txt b/chunked/nltk_chunking/_big_models/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed6342a302a74de371570cd62848722596549f9f --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_18.txt @@ -0,0 +1 @@ +Please read the following guide for more information: Large model loading using Accelerate \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_2.txt b/chunked/nltk_chunking/_big_models/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..3180470aa30924e164ef538d4e3e9d30afd7464c --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_2.txt @@ -0,0 +1 @@ +Load your pretrained weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_3.txt b/chunked/nltk_chunking/_big_models/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3bf7e34b933b5ef3b93577f99933307cb6f6648 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_3.txt @@ -0,0 +1 @@ +Put those pretrained weights in your random model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_4.txt b/chunked/nltk_chunking/_big_models/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cffdfaf38140f42729d5817dd89d5d39be73ebe --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_4.txt @@ -0,0 +1 @@ +Step 1 and 2 both require a full version of the model in memory, which is not a problem in most cases, but if your model starts weighing several GigaBytes, those two copies can make you get out of RAM. 
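The Accelerate-backed tools referenced above target exactly this two-copy problem; a minimal hedged sketch follows (it assumes the accelerate package is installed, and bigscience/bloom-560m is just an arbitrary example of a sharded checkpoint):

from transformers import AutoModelForCausalLM

# low_cpu_mem_usage builds the model with empty weights and fills it shard by shard,
# keeping peak RAM close to a single copy of the weights; device_map="auto" additionally
# dispatches the filled weights across the available devices.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-560m",
    low_cpu_mem_usage=True,
    device_map="auto",
)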
\ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_5.txt b/chunked/nltk_chunking/_big_models/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..38a57379e819dc86bdd6b95c772923f70097c348 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_5.txt @@ -0,0 +1 @@ +Even worse, if you are using torch.distributed to launch a distributed training, each process will load the pretrained model and store these two copies in RAM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_6.txt b/chunked/nltk_chunking/_big_models/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..48c8a2cc4d3c1627a022ed3c736f314009cff362 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_6.txt @@ -0,0 +1 @@ +Note that the randomly created model is initialized with "empty" tensors, which take the space in memory without filling it (thus the random values are whatever was in this chunk of memory at a given time). \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_7.txt b/chunked/nltk_chunking/_big_models/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..d662dc6e8cd1a3fafaba0d6efe4c1b0f18ccaa4a --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_7.txt @@ -0,0 +1 @@ +The random initialization following the appropriate distribution for the kind of model/parameters instantiated (like a normal distribution for instance) is only performed after step 3 on the non-initialized weights, to be as fast as possible! \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_8.txt b/chunked/nltk_chunking/_big_models/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f8e6692e212b7973346f4c6541f2c83aeecf085 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_8.txt @@ -0,0 +1 @@ +In this guide, we explore the solutions Transformers offer to deal with this issue. \ No newline at end of file diff --git a/chunked/nltk_chunking/_big_models/chunk_9.txt b/chunked/nltk_chunking/_big_models/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..c89bc83385ba6dd7256cce269f67903f85601bb6 --- /dev/null +++ b/chunked/nltk_chunking/_big_models/chunk_9.txt @@ -0,0 +1 @@ +Note that this is an area of active development, so the APIs explained here may change slightly in the future. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_0.txt b/chunked/nltk_chunking/_chat_templating/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..234b0afd93b14aed9fa2ebf68801f67123747d77 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_0.txt @@ -0,0 +1,4 @@ + +Templates for Chat Models +Introduction +An increasingly common use case for LLMs is chat. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_1.txt b/chunked/nltk_chunking/_chat_templating/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1b5f12f6db1fcdae7e0d070cb4a2b5d7168e5f8 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_1.txt @@ -0,0 +1,3 @@ +In a chat context, rather than continuing a single string +of text (as is the case with a standard language model), the model instead continues a conversation that consists +of one or more messages, each of which includes a role, like "user" or "assistant", as well as message text. 
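Concretely, such a conversation is passed around as a list of dicts with role and content keys; here is a minimal sketch of rendering it with the tokenizer's own template (the checkpoint is an arbitrary assumption, and any chat model with a template behaves the same way):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
messages = [
    {"role": "system", "content": "You are a friendly chatbot."},
    {"role": "user", "content": "Hello, how are you?"},
]
# Render the conversation with the model's chat template and append the assistant prompt.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)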
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_10.txt b/chunked/nltk_chunking/_chat_templating/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..cff4faed941c6ed3d972de75fe2453858cd0cad4 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_10.txt @@ -0,0 +1,2 @@ +"}, + {"role": "user", "content": "I'd like to show off how chat templating works! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_100.txt b/chunked/nltk_chunking/_chat_templating/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..a17d6078bb377ee2274a3f32ed4c4e31ed81a683 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_100.txt @@ -0,0 +1,11 @@ +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'] + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'] + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ ' ' + message['content'] + ' ' + eos_token }} + {% endif %} +{% endfor %} +Hopefully if you stare at this for a little bit you can see what this template is doing - it adds specific tokens based +on the "role" of each message, which represents who sent it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_101.txt b/chunked/nltk_chunking/_chat_templating/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb8c1b207f23868f6a82d66590836fc00c7d02c4 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_101.txt @@ -0,0 +1,2 @@ +User, assistant and system messages are clearly +distinguishable to the model because of the tokens they're wrapped in. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_102.txt b/chunked/nltk_chunking/_chat_templating/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..cac69a0288165859e207e8a0416bb8a14a2d9ea4 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_102.txt @@ -0,0 +1,2 @@ +Advanced: Adding and editing chat templates +How do I create a chat template? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_103.txt b/chunked/nltk_chunking/_chat_templating/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6005c7e27ad9dc5a4fb9350c8a22204b2cad8e4 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_103.txt @@ -0,0 +1 @@ +Simple, just write a jinja template and set tokenizer.chat_template. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_104.txt b/chunked/nltk_chunking/_chat_templating/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..032f8ca5962b6d6eb69f4eaddcbb7ef16f68b247 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_104.txt @@ -0,0 +1,2 @@ +You may find it easier to start with an +existing template from another model and simply edit it for your needs! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_105.txt b/chunked/nltk_chunking/_chat_templating/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..0aeb8d67a4b03443b82fc0f31d9a2c09552bdcf0 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_105.txt @@ -0,0 +1,12 @@ +For example, we could take the LLaMA template +above and add "[ASST]" and "[/ASST]" to assistant messages: +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }} + {% elif message['role'] == 'system' %} + {{ '<>\\n' + message['content'].strip() + '\\n<>\\n\\n' }} + {% elif message['role'] == 'assistant' %} + {{ '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }} + {% endif %} +{% endfor %} +Now, simply set the tokenizer.chat_template attribute. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_106.txt b/chunked/nltk_chunking/_chat_templating/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..dee8ca1c8ab1e3ff893906e237baec4d69309488 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_106.txt @@ -0,0 +1,2 @@ +Next time you use [~PreTrainedTokenizer.apply_chat_template], it will +use your new template! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_107.txt b/chunked/nltk_chunking/_chat_templating/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..03ddda7b863b5416d7fac12e1d1ba11433d70674 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_107.txt @@ -0,0 +1,3 @@ +This attribute will be saved in the tokenizer_config.json file, so you can use +[~utils.PushToHubMixin.push_to_hub] to upload your new template to the Hub and make sure everyone's using the right +template for your model! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_108.txt b/chunked/nltk_chunking/_chat_templating/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..b9232633e2c385f5ee16e093b041222d93d22525 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_108.txt @@ -0,0 +1,5 @@ +python +template = tokenizer.chat_template +template = template.replace("SYS", "SYSTEM") # Change the system token +tokenizer.chat_template = template # Set the new template +tokenizer.push_to_hub("model_name") # Upload your new template to the Hub! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_109.txt b/chunked/nltk_chunking/_chat_templating/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..b561543a212234afd5bf03b783d4dfa42412c30e --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_109.txt @@ -0,0 +1,2 @@ +The method [~PreTrainedTokenizer.apply_chat_template] which uses your chat template is called by the [TextGenerationPipeline] class, so +once you set the correct chat template, your model will automatically become compatible with [TextGenerationPipeline]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_11.txt b/chunked/nltk_chunking/_chat_templating/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc2636007b150e9260aacc8ae0dee56b7a8c2ec2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_11.txt @@ -0,0 +1,4 @@ +"}, + ] +tokenizer.apply_chat_template(chat, tokenize=False) +" Hello, how are you? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_110.txt b/chunked/nltk_chunking/_chat_templating/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..74c6d68646a7188b7307b8dc47a471327e08e38d --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_110.txt @@ -0,0 +1,2 @@ +If you're fine-tuning a model for chat, in addition to setting a chat template, you should probably add any new chat +control tokens as special tokens in the tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_111.txt b/chunked/nltk_chunking/_chat_templating/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..3eee7b3d76496cca5142243aede4cab428cef353 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_111.txt @@ -0,0 +1,2 @@ +Special tokens are never split, +ensuring that your control tokens are always handled as single tokens rather than being tokenized in pieces. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_112.txt b/chunked/nltk_chunking/_chat_templating/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d7d2c09217f859275f6ce3ccbe44793e79ae889 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_112.txt @@ -0,0 +1,3 @@ +You +should also set the tokenizer's eos_token attribute to the token that marks the end of assistant generations in your +template. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_113.txt b/chunked/nltk_chunking/_chat_templating/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2e950b8fbfc98183e1d03eced8f9b0525dc5e2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_113.txt @@ -0,0 +1 @@ +This will ensure that text generation tools can correctly figure out when to stop generating text. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_114.txt b/chunked/nltk_chunking/_chat_templating/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c34d80eef4b1cf034581888e430cede43ef6138 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_114.txt @@ -0,0 +1 @@ +What are "default" templates? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_115.txt b/chunked/nltk_chunking/_chat_templating/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..27ae01075ada6c8d93e7eb3ad3efd490f6a98a59 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_115.txt @@ -0,0 +1 @@ +Before the introduction of chat templates, chat handling was hardcoded at the model class level. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_116.txt b/chunked/nltk_chunking/_chat_templating/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd086625513d3019538c19159ccf315e18086aec --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_116.txt @@ -0,0 +1,2 @@ +For backwards +compatibility, we have retained this class-specific handling as default templates, also set at the class level. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_117.txt b/chunked/nltk_chunking/_chat_templating/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4457c994f7487897a00a3eb495d54018da83d4c --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_117.txt @@ -0,0 +1,3 @@ +If a +model does not have a chat template set, but there is a default template for its model class, the TextGenerationPipeline +class and methods like apply_chat_template will use the class template instead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_118.txt b/chunked/nltk_chunking/_chat_templating/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..a84c318c4d2b1d7f223a1c240058fd7eda7527ba --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_118.txt @@ -0,0 +1,2 @@ +You can find out what the default +template for your tokenizer is by checking the tokenizer.default_chat_template attribute. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_119.txt b/chunked/nltk_chunking/_chat_templating/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1cd01a58479c108a9b75df908d94fb6b8079667 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_119.txt @@ -0,0 +1 @@ +This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_12.txt b/chunked/nltk_chunking/_chat_templating/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7ee57c5df6f14bfb6097c6cf2d36c5e0deca07d --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_12.txt @@ -0,0 +1 @@ +I'm doing great. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_120.txt b/chunked/nltk_chunking/_chat_templating/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..778dd8f0b1add0bd2fcc79fa8ee240d3490a0da3 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_120.txt @@ -0,0 +1,4 @@ +Even when +the class template is appropriate for your model, we strongly recommend overriding the default template by +setting the chat_template attribute explicitly to make it clear to users that your model has been correctly configured +for chat, and to future-proof in case the default templates are ever altered or deprecated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_121.txt b/chunked/nltk_chunking/_chat_templating/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..24b2313b9dd899e96fbd0786166f2401f69710fe --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_121.txt @@ -0,0 +1 @@ +What template should I use? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_122.txt b/chunked/nltk_chunking/_chat_templating/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..65cb7cd8b7e34495752afdfb9713fb99110d3fef --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_122.txt @@ -0,0 +1,3 @@ +When setting the template for a model that's already been trained for chat, you should ensure that the template +exactly matches the message formatting that the model saw during training, or else you will probably experience +performance degradation. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_123.txt b/chunked/nltk_chunking/_chat_templating/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..342755bd82e90c96b18640dac163c138e5d50615 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_123.txt @@ -0,0 +1,2 @@ +This is true even if you're training the model further - you will probably get the best +performance if you keep the chat tokens constant. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_124.txt b/chunked/nltk_chunking/_chat_templating/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..c98df36959dace2e6959508338e6189d3676ab4f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_124.txt @@ -0,0 +1,2 @@ +This is very analogous to tokenization - you generally get the +best performance for inference or fine-tuning when you precisely match the tokenization used during training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_125.txt b/chunked/nltk_chunking/_chat_templating/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5449710be1b0acc8676953a84c03432fa55d511 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_125.txt @@ -0,0 +1,2 @@ +If you're training a model from scratch, or fine-tuning a base language model for chat, on the other hand, +you have a lot of freedom to choose an appropriate template! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_126.txt b/chunked/nltk_chunking/_chat_templating/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..e280fa0981701d58a0a11ad3af8b0e2a838f9af0 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_126.txt @@ -0,0 +1,2 @@ +LLMs are smart enough to learn to handle lots of different +input formats. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_127.txt b/chunked/nltk_chunking/_chat_templating/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..87b507dc45dd10f8278bca6ab3ec5d50e6de3605 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_127.txt @@ -0,0 +1,2 @@ +Our default template for models that don't have a class-specific template follows the +ChatML format, and this is a good, flexible choice for many use-cases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_128.txt b/chunked/nltk_chunking/_chat_templating/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8576391a85f4b0855901f350152cb043ca44680 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_128.txt @@ -0,0 +1,5 @@ +It looks like this: +{% for message in messages %} + {{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}} +{% endfor %} +If you like this one, here it is in one-liner form, ready to copy into your code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_129.txt b/chunked/nltk_chunking/_chat_templating/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..f04b1d6ad5d8cd00f85b174c0c202409df9faf58 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_129.txt @@ -0,0 +1,2 @@ +The one-liner also includes +handy support for generation prompts, but note that it doesn't add BOS or EOS tokens! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_13.txt b/chunked/nltk_chunking/_chat_templating/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..33b79472da2a0e868de99b29c7f096532c666814 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_13.txt @@ -0,0 +1 @@ +How can I help you today? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_130.txt b/chunked/nltk_chunking/_chat_templating/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4d34329cc6c2e10ba2df94449a112906bcc337 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_130.txt @@ -0,0 +1,2 @@ +If your model expects those, they won't be added automatically by apply_chat_template - in other words, the +text will be tokenized with add_special_tokens=False. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_131.txt b/chunked/nltk_chunking/_chat_templating/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..625773ee95037d97fd3f70a9eac41c3e378b460f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_131.txt @@ -0,0 +1,2 @@ +This is to avoid potential conflicts between the template and +the add_special_tokens logic. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_132.txt b/chunked/nltk_chunking/_chat_templating/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..71c8a12a7eca727948696566286376e9dd7741e3 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_132.txt @@ -0,0 +1 @@ +If your model expects special tokens, make sure to add them to the template! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_133.txt b/chunked/nltk_chunking/_chat_templating/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..c210a893f1e2412d36583aaaf9b95e04d1c91910 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_133.txt @@ -0,0 +1,4 @@ +python +tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}" +This template wraps each message in <|im_start|> and <|im_end|> tokens, and simply writes the role as a string, which +allows for flexibility in the roles you train with. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_134.txt b/chunked/nltk_chunking/_chat_templating/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..b89f99a512226c9158a3cc946d71a90bd1cdd72a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_134.txt @@ -0,0 +1,10 @@ +The output looks like this: +text +<|im_start|>system +You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|> +<|im_start|>user +How are you?<|im_end|> +<|im_start|>assistant +I'm doing great!<|im_end|> +The "user", "system" and "assistant" roles are the standard for chat, and we recommend using them when it makes sense, +particularly if you want your model to operate well with [TextGenerationPipeline]. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_135.txt b/chunked/nltk_chunking/_chat_templating/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2a077912d8405be5d431a66ab2f98115b33def0 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_135.txt @@ -0,0 +1,2 @@ +However, you are not limited +to these roles - templating is extremely flexible, and any string can be a role. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_136.txt b/chunked/nltk_chunking/_chat_templating/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b248f3df72b2c7078607b45efc0f1b66fdd4ebe --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_136.txt @@ -0,0 +1 @@ +I want to add some chat templates! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_137.txt b/chunked/nltk_chunking/_chat_templating/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1281123e7dd8841e9d9d1b2b9b9662d6688750a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_137.txt @@ -0,0 +1 @@ +How should I get started? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_138.txt b/chunked/nltk_chunking/_chat_templating/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..52a0bdb2ff5e17351d63e5be007b615cee864edb --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_138.txt @@ -0,0 +1,2 @@ +If you have any chat models, you should set their tokenizer.chat_template attribute and test it using +[~PreTrainedTokenizer.apply_chat_template], then push the updated tokenizer to the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_139.txt b/chunked/nltk_chunking/_chat_templating/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6f76afd9c67ee62cb3f82bcc2e04aa20ee597b0 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_139.txt @@ -0,0 +1,3 @@ +This applies even if you're +not the model owner - if you're using a model with an empty chat template, or one that's still using the default class +template, please open a pull request to the model repository so that this attribute can be set properly! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_14.txt b/chunked/nltk_chunking/_chat_templating/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe272434dc3d5781299894bb017d39ecb008cae5 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_14.txt @@ -0,0 +1 @@ +I'd like to show off how chat templating works!" \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_140.txt b/chunked/nltk_chunking/_chat_templating/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..070a2404d352791c7502730feb0e76b107b29128 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_140.txt @@ -0,0 +1 @@ +Once the attribute is set, that's it, you're done! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_141.txt b/chunked/nltk_chunking/_chat_templating/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8ad787f05e2149e6750588abda9ee0559e302fb --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_141.txt @@ -0,0 +1,2 @@ +tokenizer.apply_chat_template will now work correctly for that +model, which means it is also automatically supported in places like TextGenerationPipeline! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_142.txt b/chunked/nltk_chunking/_chat_templating/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..178a8068f08de2637c30e10a478b6563a602e58b --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_142.txt @@ -0,0 +1,2 @@ +By ensuring that models have this attribute, we can make sure that the whole community gets to use the full power of +open-source models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_143.txt b/chunked/nltk_chunking/_chat_templating/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..aec84b1ee27955d306965f00655eac84ebce27aa --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_143.txt @@ -0,0 +1,2 @@ +Formatting mismatches have been haunting the field and silently harming performance for too long - +it's time to put an end to them! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_144.txt b/chunked/nltk_chunking/_chat_templating/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d958e93cae7e582f9da664712e441025d48063b --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_144.txt @@ -0,0 +1,3 @@ +Advanced: Template writing tips +If you're unfamiliar with Jinja, we generally find that the easiest way to write a chat template is to first +write a short Python script that formats messages the way you want, and then convert that script into a template. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_145.txt b/chunked/nltk_chunking/_chat_templating/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0a8da35d0e58c32cb2f1215b308e9ddc6bafff6 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_145.txt @@ -0,0 +1 @@ +Remember that the template handler will receive the conversation history as a variable called messages. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_146.txt b/chunked/nltk_chunking/_chat_templating/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b3ce11866fd407b32683e0e2700f23092c0a149 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_146.txt @@ -0,0 +1,2 @@ +Each +message is a dictionary with two keys, role and content. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_147.txt b/chunked/nltk_chunking/_chat_templating/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..574e82f9f72f46bd44c2a338ee5d9dad71fd16b7 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_147.txt @@ -0,0 +1,3 @@ +You will be able to access messages in your template +just like you can in Python, which means you can loop over it with {% for message in messages %} or access +individual messages with, for example, {{ messages[0] }}. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_148.txt b/chunked/nltk_chunking/_chat_templating/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d418e891dd29049e7fb034ccf0a25b35249b6f7 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_148.txt @@ -0,0 +1,7 @@ +You can also use the following tips to convert your code to Jinja: +For loops +For loops in Jinja look like this: +{% for message in messages %} +{{ message['content'] }} +{% endfor %} +Note that whatever's inside the {{ expression block }} will be printed to the output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_149.txt b/chunked/nltk_chunking/_chat_templating/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe54aef800714b7675be0f2c2fa41659277888ac --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_149.txt @@ -0,0 +1,2 @@ +You can use operators like ++ to combine strings inside expression blocks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_15.txt b/chunked/nltk_chunking/_chat_templating/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3179459d8bfa446ca9e147c9d10ceb856bc905a5 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_15.txt @@ -0,0 +1 @@ +Notice how the entire chat is condensed into a single string. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_150.txt b/chunked/nltk_chunking/_chat_templating/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ccae163d04f41db1e4ae398e535396815529959 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_150.txt @@ -0,0 +1,7 @@ +If statements +If statements in Jinja look like this: +{% if message['role'] == 'user' %} +{{ message['content'] }} +{% endif %} +Note how where Python uses whitespace to mark the beginnings and ends of for and if blocks, Jinja requires you +to explicitly end them with {% endfor %} and {% endif %}. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_151.txt b/chunked/nltk_chunking/_chat_templating/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..0afc4c436909b2a570271270a6e22eaf3ac06abe --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_151.txt @@ -0,0 +1,3 @@ +Special variables +Inside your template, you will have access to the list of messages, but you can also access several other special +variables. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_152.txt b/chunked/nltk_chunking/_chat_templating/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..9605efc8eb7d6716d9416a7a90f5b831c137ddb1 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_152.txt @@ -0,0 +1,2 @@ +These include special tokens like bos_token and eos_token, as well as the add_generation_prompt +variable that we discussed above. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_153.txt b/chunked/nltk_chunking/_chat_templating/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..b280f14293d43216604451c069a9f5b31c2cded6 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_153.txt @@ -0,0 +1,3 @@ +You can also use the loop variable to access information about the current loop +iteration, for example using {% if loop.last %} to check if the current message is the last message in the +conversation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_154.txt b/chunked/nltk_chunking/_chat_templating/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..12924d2af1d87971a4a8cc46f490f2e5fd50f5ac --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_154.txt @@ -0,0 +1,7 @@ +Here's an example that puts these ideas together to add a generation prompt at the end of the +conversation if add_generation_prompt is True: +{% if loop.last and add_generation_prompt %} +{{ bos_token + 'Assistant:\n' }} +{% endif %} +Notes on whitespace +As much as possible, we've tried to get Jinja to ignore whitespace outside of {{ expressions }}. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_155.txt b/chunked/nltk_chunking/_chat_templating/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f4589320659e1c1a27f762fb8e529c5c1c2af26 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_155.txt @@ -0,0 +1,3 @@ +However, be aware +that Jinja is a general-purpose templating engine, and it may treat whitespace between blocks on the same line +as significant and print it to the output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_156.txt b/chunked/nltk_chunking/_chat_templating/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd9658ff3653969507aa64a68baeacd3e5245c0e --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_156.txt @@ -0,0 +1,2 @@ +We strongly recommend checking that your template isn't printing extra +spaces where it shouldn't be before you upload it! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_16.txt b/chunked/nltk_chunking/_chat_templating/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..8149055d5befc2da458dec9fce5e912e2cce0a86 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_16.txt @@ -0,0 +1,2 @@ +If we use tokenize=True, which is the default setting, +that string will also be tokenized for us. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_17.txt b/chunked/nltk_chunking/_chat_templating/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..429f02f6a8a00e5143af313cea9dc8ced91af632 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_17.txt @@ -0,0 +1,2 @@ +To see a more complex template in action, though, let's use the +mistralai/Mistral-7B-Instruct-v0.1 model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_18.txt b/chunked/nltk_chunking/_chat_templating/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..0439c9eb5523d621d4b83e686d4781e1e4c0e026 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_18.txt @@ -0,0 +1,6 @@ +thon + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1") +chat = [ + {"role": "user", "content": "Hello, how are you? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_19.txt b/chunked/nltk_chunking/_chat_templating/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a714ecf0402d6285d6f898324fa24a816d1a3ad2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_19.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "I'm doing great. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_2.txt b/chunked/nltk_chunking/_chat_templating/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf5aba294b329de93a9a4ebb13bb4b78a9556ca6 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_2.txt @@ -0,0 +1 @@ +Much like tokenization, different models expect very different input formats for chat. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_20.txt b/chunked/nltk_chunking/_chat_templating/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..33b79472da2a0e868de99b29c7f096532c666814 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_20.txt @@ -0,0 +1 @@ +How can I help you today? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_21.txt b/chunked/nltk_chunking/_chat_templating/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b5ea47a3706dc8dad93cee7e865fde2675ba84d --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_21.txt @@ -0,0 +1,2 @@ +"}, + {"role": "user", "content": "I'd like to show off how chat templating works! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_22.txt b/chunked/nltk_chunking/_chat_templating/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..54c3aaf1085a4a60da7644fe24bee4052d0811a8 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_22.txt @@ -0,0 +1,4 @@ +"}, + ] +tokenizer.apply_chat_template(chat, tokenize=False) +"[INST] Hello, how are you? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_23.txt b/chunked/nltk_chunking/_chat_templating/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c18c8b229752a4d8953dc9c0d60ee0a8068fc23e --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_23.txt @@ -0,0 +1 @@ +[/INST]I'm doing great. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_24.txt b/chunked/nltk_chunking/_chat_templating/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..33b79472da2a0e868de99b29c7f096532c666814 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_24.txt @@ -0,0 +1 @@ +How can I help you today? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_25.txt b/chunked/nltk_chunking/_chat_templating/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..e24fd8d1f8666392db9f7136b5ada53e08ea5434 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_25.txt @@ -0,0 +1 @@ +[INST] I'd like to show off how chat templating works! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_26.txt b/chunked/nltk_chunking/_chat_templating/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..f84e1d49257f1c5d77662f866f32a6659c16f472 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_26.txt @@ -0,0 +1,4 @@ +[/INST]" + +Note that this time, the tokenizer has added the control tokens [INST] and [/INST] to indicate the start and end of +user messages (but not assistant messages!). \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_27.txt b/chunked/nltk_chunking/_chat_templating/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..fccf2d2a9ffdbe2a12c5279fcba1d0abbed11091 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_27.txt @@ -0,0 +1 @@ +Mistral-instruct was trained with these tokens, but BlenderBot was not. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_28.txt b/chunked/nltk_chunking/_chat_templating/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a75fb631c0b1af990d7efba2eb46ec68d6bca57e --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_28.txt @@ -0,0 +1 @@ +How do I use chat templates? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_29.txt b/chunked/nltk_chunking/_chat_templating/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0e7071db6cbdbb5892d9e3cba4cb0352fa438a6 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_29.txt @@ -0,0 +1 @@ +As you can see in the example above, chat templates are easy to use. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_3.txt b/chunked/nltk_chunking/_chat_templating/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee0db0a247b8534e5b473b7041db42e552208388 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_3.txt @@ -0,0 +1,2 @@ +This is the reason we added +chat templates as a feature. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_30.txt b/chunked/nltk_chunking/_chat_templating/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bb2e9c85e130ff3c9c5ec48fc07d8b931030d99 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_30.txt @@ -0,0 +1,2 @@ +Simply build a list of messages, with role +and content keys, and then pass it to the [~PreTrainedTokenizer.apply_chat_template] method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_31.txt b/chunked/nltk_chunking/_chat_templating/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..8858600166d30543a1a12bbb699dbb103b88f92a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_31.txt @@ -0,0 +1,2 @@ +Once you do that, +you'll get output that's ready to go! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_32.txt b/chunked/nltk_chunking/_chat_templating/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8579b1684b5687c8fb5e252740201c8a706d014 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_32.txt @@ -0,0 +1,2 @@ +When using chat templates as input for model generation, it's also a good idea +to use add_generation_prompt=True to add a generation prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_33.txt b/chunked/nltk_chunking/_chat_templating/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..c85b949521f9e0ea7350f58b6e6359ac6ff96ef3 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_33.txt @@ -0,0 +1,12 @@ +Here's an example of preparing input for model.generate(), using the Zephyr assistant model: +thon +from transformers import AutoModelForCausalLM, AutoTokenizer +checkpoint = "HuggingFaceH4/zephyr-7b-beta" +tokenizer = AutoTokenizer.from_pretrained(checkpoint) +model = AutoModelForCausalLM.from_pretrained(checkpoint) # You may want to use bfloat16 and/or move to GPU here +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many helicopters can a human eat in one sitting? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_34.txt b/chunked/nltk_chunking/_chat_templating/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddc1dadb3f42ca2d8d6aaaf839c376854d674ca9 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_34.txt @@ -0,0 +1,9 @@ +"}, + ] +tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt") +print(tokenizer.decode(tokenized_chat[0])) +This will yield a string in the input format that Zephyr expects.text +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_35.txt b/chunked/nltk_chunking/_chat_templating/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e14b65fc0bbfd7eb753306d0a740dbb7714dbec --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_35.txt @@ -0,0 +1,14 @@ +<|assistant|> + +Now that our input is formatted correctly for Zephyr, we can use the model to generate a response to the user's question: +python +outputs = model.generate(tokenized_chat, max_new_tokens=128) +print(tokenizer.decode(outputs[0])) +This will yield: +text +<|system|> +You are a friendly chatbot who always responds in the style of a pirate +<|user|> +How many helicopters can a human eat in one sitting? +<|assistant|> +Matey, I'm afraid I must inform ye that humans cannot eat helicopters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_36.txt b/chunked/nltk_chunking/_chat_templating/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffab1221649abbe24e09b8daaa823a4ed0cf5b52 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_36.txt @@ -0,0 +1 @@ +Helicopters are not food, they are flying machines. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_37.txt b/chunked/nltk_chunking/_chat_templating/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..06a4f1b11bb9c4b2c399a3a61d67c84d2b19af56 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_37.txt @@ -0,0 +1 @@ +Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_38.txt b/chunked/nltk_chunking/_chat_templating/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..e602cdae43a86c0499518c6e3d3db62ba82831fb --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_38.txt @@ -0,0 +1 @@ +But helicopters, they be for transportin' and movin' around, not for eatin'. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_39.txt b/chunked/nltk_chunking/_chat_templating/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..d38bf5ba7a803618ba581ebf465588a48c43315f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_39.txt @@ -0,0 +1 @@ +So, I'd say none, me hearties. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_4.txt b/chunked/nltk_chunking/_chat_templating/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7fb86d84def515e33f0905eb2b2e8a1bd488d4a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_4.txt @@ -0,0 +1 @@ +Chat templates are part of the tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_40.txt b/chunked/nltk_chunking/_chat_templating/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa85f5f76ba8ea5bbc9ac546bd4f0a61931d50f5 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_40.txt @@ -0,0 +1 @@ +None at all. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_41.txt b/chunked/nltk_chunking/_chat_templating/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce8286e29dd4f0f63dc32a73dfff959b5acc867c --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_41.txt @@ -0,0 +1 @@ +Arr, 'twas easy after all! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_42.txt b/chunked/nltk_chunking/_chat_templating/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..89fe91ed501407aab146ea9130b2168969be0460 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_42.txt @@ -0,0 +1 @@ +Is there an automated pipeline for chat? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_43.txt b/chunked/nltk_chunking/_chat_templating/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..90db396e947ebdfb4e89e3e3fbc5138e192cb0a9 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_43.txt @@ -0,0 +1 @@ +Yes, there is! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_44.txt b/chunked/nltk_chunking/_chat_templating/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..796808e9a5891cbb588680a7ead8f6f573941dd2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_44.txt @@ -0,0 +1 @@ +Our text generation pipelines support chat inputs, which makes it easy to use chat models. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_45.txt b/chunked/nltk_chunking/_chat_templating/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..27a1e3283c9355222aabf63209780a200a753872 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_45.txt @@ -0,0 +1,3 @@ +In the past, +we used to use a dedicated "ConversationalPipeline" class, but this has now been deprecated and its functionality +has been merged into the [TextGenerationPipeline]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_46.txt b/chunked/nltk_chunking/_chat_templating/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6714046aca403d8b35ddb18c6ef2d372b151cdf --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_46.txt @@ -0,0 +1,11 @@ +Let's try the Zephyr example again, but this time using +a pipeline: +thon +from transformers import pipeline +pipe = pipeline("text-generation", "HuggingFaceH4/zephyr-7b-beta") +messages = [ + { + "role": "system", + "content": "You are a friendly chatbot who always responds in the style of a pirate", + }, + {"role": "user", "content": "How many helicopters can a human eat in one sitting? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_47.txt b/chunked/nltk_chunking/_chat_templating/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b4862b0989c2aa57b54bed5b8dcb018c005a4ab --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_47.txt @@ -0,0 +1,6 @@ +"}, +] +print(pipe(messages, max_new_tokens=128)[0]['generated_text'][-1]) # Print the assistant's response + +text +{'role': 'assistant', 'content': "Matey, I'm afraid I must inform ye that humans cannot eat helicopters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_48.txt b/chunked/nltk_chunking/_chat_templating/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffab1221649abbe24e09b8daaa823a4ed0cf5b52 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_48.txt @@ -0,0 +1 @@ +Helicopters are not food, they are flying machines. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_49.txt b/chunked/nltk_chunking/_chat_templating/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..06a4f1b11bb9c4b2c399a3a61d67c84d2b19af56 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_49.txt @@ -0,0 +1 @@ +Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_5.txt b/chunked/nltk_chunking/_chat_templating/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..91843595f1ebfe2bf309aa093c24404bcd176928 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_5.txt @@ -0,0 +1,2 @@ +They specify how to convert conversations, +represented as lists of messages, into a single tokenizable string in the format that the model expects. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_50.txt b/chunked/nltk_chunking/_chat_templating/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..e602cdae43a86c0499518c6e3d3db62ba82831fb --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_50.txt @@ -0,0 +1 @@ +But helicopters, they be for transportin' and movin' around, not for eatin'. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_51.txt b/chunked/nltk_chunking/_chat_templating/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..d38bf5ba7a803618ba581ebf465588a48c43315f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_51.txt @@ -0,0 +1 @@ +So, I'd say none, me hearties. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_52.txt b/chunked/nltk_chunking/_chat_templating/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..8daff91ba4d643033f8733f5069d8906ef908df7 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_52.txt @@ -0,0 +1 @@ +None at all."} \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_53.txt b/chunked/nltk_chunking/_chat_templating/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f09aa40ba87a00d38adbdabbb968951c3f545bb --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_53.txt @@ -0,0 +1,2 @@ +The pipeline will take care of all the details of tokenization and calling apply_chat_template for you - +once the model has a chat template, all you need to do is initialize the pipeline and pass it the list of messages! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_54.txt b/chunked/nltk_chunking/_chat_templating/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e3adaa28e76b766b12092523c494e603514a363 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_54.txt @@ -0,0 +1 @@ +What are "generation prompts"? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_55.txt b/chunked/nltk_chunking/_chat_templating/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..590d460c004d81b30744ea4d2a5d54fb952e2a8a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_55.txt @@ -0,0 +1 @@ +You may have noticed that the apply_chat_template method has an add_generation_prompt argument. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_56.txt b/chunked/nltk_chunking/_chat_templating/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..90b8425e63da9911be571ded91ee767875a4e71d --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_56.txt @@ -0,0 +1,2 @@ +This argument tells +the template to add tokens that indicate the start of a bot response. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_57.txt b/chunked/nltk_chunking/_chat_templating/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..84db913ad5e2d11992154a041091dfe022f52925 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_57.txt @@ -0,0 +1,4 @@ +For example, consider the following chat: +python +messages = [ + {"role": "user", "content": "Hi there! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_58.txt b/chunked/nltk_chunking/_chat_templating/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6e756aea254b9d47194659e23e25058abd0b01a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_58.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "Nice to meet you! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_59.txt b/chunked/nltk_chunking/_chat_templating/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..76d1442ca7b02834570732b413eef0af6e820471 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_59.txt @@ -0,0 +1,2 @@ +"}, + {"role": "user", "content": "Can I ask a question?"} \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_6.txt b/chunked/nltk_chunking/_chat_templating/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..d657a5b84176d927abc6d8163f62ee064f0913d4 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_6.txt @@ -0,0 +1 @@ +Let's make this concrete with a quick example using the BlenderBot model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_60.txt b/chunked/nltk_chunking/_chat_templating/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..26e74ef3343fbe911753a79de8e76e1f42e4091f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_60.txt @@ -0,0 +1,23 @@ +] +Here's what this will look like without a generation prompt, using the ChatML template we saw in the Zephyr example: +python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +""" +And here's what it looks like with a generation prompt: +python +tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) +"""<|im_start|>user +Hi there!<|im_end|> +<|im_start|>assistant +Nice to meet you!<|im_end|> +<|im_start|>user +Can I ask a question?<|im_end|> +<|im_start|>assistant +""" +Note that this time, we've added the tokens that indicate the start of a bot response. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_61.txt b/chunked/nltk_chunking/_chat_templating/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..5005eb224a77708e27b60cee4d7e03be3b8ca85f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_61.txt @@ -0,0 +1,3 @@ +This ensures that when the model +generates text it will write a bot response instead of doing something unexpected, like continuing the user's +message. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_62.txt b/chunked/nltk_chunking/_chat_templating/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..961e6a4670bf93ca66baf7c1836572ddb2bd3d8d --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_62.txt @@ -0,0 +1,2 @@ +Remember, chat models are still just language models - they're trained to continue text, and chat is just a +special kind of text to them! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_63.txt b/chunked/nltk_chunking/_chat_templating/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..500d0ba79609f90e3df5867ec7f39cfbb868471c --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_63.txt @@ -0,0 +1,2 @@ +You need to guide them with appropriate control tokens, so they know what they're +supposed to be doing. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_64.txt b/chunked/nltk_chunking/_chat_templating/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..307a67363fc9176cf9fc6d9cbb68c0d01581b453 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_64.txt @@ -0,0 +1 @@ +Not all models require generation prompts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_65.txt b/chunked/nltk_chunking/_chat_templating/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2aa7825c0fd06035d16103a4797c75f4bb89013 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_65.txt @@ -0,0 +1,2 @@ +Some models, like BlenderBot and LLaMA, don't have any +special tokens before bot responses. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_66.txt b/chunked/nltk_chunking/_chat_templating/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dbcdd6d8d9c50e3d3e8b34cdee00a4e4ab7977b --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_66.txt @@ -0,0 +1 @@ +In these cases, the add_generation_prompt argument will have no effect. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_67.txt b/chunked/nltk_chunking/_chat_templating/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..da18f3dbe928caefb2580c7be0b4018f52441e1f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_67.txt @@ -0,0 +1,2 @@ +The exact +effect that add_generation_prompt has will depend on the template being used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_68.txt b/chunked/nltk_chunking/_chat_templating/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b2082f5680ac8ceb3abdc241497434e787c318a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_68.txt @@ -0,0 +1 @@ +Can I use chat templates in training? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_69.txt b/chunked/nltk_chunking/_chat_templating/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..399dfe6476443bd5a6d0e98542fc70210e8df718 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_69.txt @@ -0,0 +1 @@ +Yes! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_7.txt b/chunked/nltk_chunking/_chat_templating/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..389abe4adaf273fb3bc06302bd45bb629cf3301f --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_7.txt @@ -0,0 +1,8 @@ +BlenderBot has an extremely simple default +template, which mostly just adds whitespace between rounds of dialogue: +thon + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") +chat = [ + {"role": "user", "content": "Hello, how are you? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_70.txt b/chunked/nltk_chunking/_chat_templating/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..fba2a123f3359ff190fc37b92493890acdf89f0b --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_70.txt @@ -0,0 +1 @@ +We recommend that you apply the chat template as a preprocessing step for your dataset. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_71.txt b/chunked/nltk_chunking/_chat_templating/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..2832939c0278586ee160c5f0331cdad18677b587 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_71.txt @@ -0,0 +1,2 @@ +After this, you +can simply continue like any other language model training task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_72.txt b/chunked/nltk_chunking/_chat_templating/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..da9ada9d0ca014e199daebf48e5256300962eb57 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_72.txt @@ -0,0 +1,3 @@ +When training, you should usually set +add_generation_prompt=False, because the added tokens to prompt an assistant response will not be helpful during +training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_73.txt b/chunked/nltk_chunking/_chat_templating/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..7394e69d0fe7e6059c01fe925de3c5eac8d7cb51 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_73.txt @@ -0,0 +1,7 @@ +Let's see an example: +thon +from transformers import AutoTokenizer +from datasets import Dataset +tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta") +chat1 = [ + {"role": "user", "content": "Which is bigger, the moon or the sun? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_74.txt b/chunked/nltk_chunking/_chat_templating/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfda4813f3768fb1c74af2013e2550bbf8aba050 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_74.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "The sun."} \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_75.txt b/chunked/nltk_chunking/_chat_templating/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..89e83212e5769dc3f8f93082f678271264a8640a --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_75.txt @@ -0,0 +1,3 @@ +] +chat2 = [ + {"role": "user", "content": "Which is bigger, a virus or a bacterium? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_76.txt b/chunked/nltk_chunking/_chat_templating/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0dcd46f5df10d905f5478bba68a106484e2f5db --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_76.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "A bacterium."} \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_77.txt b/chunked/nltk_chunking/_chat_templating/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..00ef546f1a9d9e213c13dea189e5a8c36fe4a036 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_77.txt @@ -0,0 +1,7 @@ +] +dataset = Dataset.from_dict({"chat": [chat1, chat2]}) +dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)}) +print(dataset['formatted_chat'][0]) +And we get:text +<|user|> +Which is bigger, the moon or the sun? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_78.txt b/chunked/nltk_chunking/_chat_templating/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..44ab0ca2367f6b23a01f7bf47b9e4796017cd7ee --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_78.txt @@ -0,0 +1,2 @@ +<|assistant|> +The sun. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_79.txt b/chunked/nltk_chunking/_chat_templating/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..578b68ae4bbeeef80895fbf249d3f64b65f7cd3c --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_79.txt @@ -0,0 +1 @@ +From here, just continue training like you would with a standard language modelling task, using the formatted_chat column. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_8.txt b/chunked/nltk_chunking/_chat_templating/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f89b7288a9dd7ae2e186675c36533ece8419e2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_8.txt @@ -0,0 +1,2 @@ +"}, + {"role": "assistant", "content": "I'm doing great. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_80.txt b/chunked/nltk_chunking/_chat_templating/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..90967aa88caf2b1ca0af64ada72dfdb00f440f85 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_80.txt @@ -0,0 +1 @@ +Advanced: How do chat templates work? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_81.txt b/chunked/nltk_chunking/_chat_templating/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..2acdab02481345341d06425610256be443c8e038 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_81.txt @@ -0,0 +1 @@ +The chat template for a model is stored on the tokenizer.chat_template attribute. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_82.txt b/chunked/nltk_chunking/_chat_templating/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..644a1df805c9cb58e764c915439c140827c51adc --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_82.txt @@ -0,0 +1,2 @@ +If no chat template is set, the +default template for that model class is used instead. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_83.txt b/chunked/nltk_chunking/_chat_templating/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..49f00211de87779c6f2d45b270532d2b33366c81 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_83.txt @@ -0,0 +1,9 @@ +Let's take a look at the template for BlenderBot: +thon + +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") +tokenizer.default_chat_template +"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}" + +That's kind of intimidating. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_84.txt b/chunked/nltk_chunking/_chat_templating/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..949a984b2c071856bb508e32f0d572386170d6cf --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_84.txt @@ -0,0 +1 @@ +Let's add some newlines and indentation to make it more readable. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_85.txt b/chunked/nltk_chunking/_chat_templating/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..3261619af04ce65ac58f294e92593e77b9bec0a0 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_85.txt @@ -0,0 +1,3 @@ +Note that the first +newline after each block as well as any preceding whitespace before a block are ignored by default, using the +Jinja trim_blocks and lstrip_blocks flags. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_86.txt b/chunked/nltk_chunking/_chat_templating/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..caa700a9661bec10ca11ab27583b0022fac22f9b --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_86.txt @@ -0,0 +1,2 @@ +However, be cautious - although leading whitespace on each +line is stripped, spaces between blocks on the same line are not. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_87.txt b/chunked/nltk_chunking/_chat_templating/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..772932220adbde5ec0b667345975633ec9db9d13 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_87.txt @@ -0,0 +1,2 @@ +We strongly recommend checking that your template +isn't printing extra spaces where it shouldn't be! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_88.txt b/chunked/nltk_chunking/_chat_templating/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..9464931e14b864b6ad7268e7c370b467e97af929 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_88.txt @@ -0,0 +1,11 @@ +{% for message in messages %} + {% if message['role'] == 'user' %} + {{ ' ' }} + {% endif %} + {{ message['content'] }} + {% if not loop.last %} + {{ ' ' }} + {% endif %} +{% endfor %} +{{ eos_token }} +If you've never seen one of these before, this is a Jinja template. 
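The same template can also be exercised outside of transformers to see exactly what it produces. Below is a minimal sketch using the jinja2 package directly, with trim_blocks and lstrip_blocks enabled as described above; the messages and the eos_token value are invented for illustration.

```python
from jinja2 import Environment

env = Environment(trim_blocks=True, lstrip_blocks=True)
template = env.from_string(
    "{% for message in messages %}"
    "{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"
    "{{ message['content'] }}"
    "{% if not loop.last %}{{ ' ' }}{% endif %}"
    "{% endfor %}"
    "{{ eos_token }}"
)
messages = [
    {"role": "user", "content": "Hi!"},
    {"role": "assistant", "content": "Hello, how are you?"},
]
# Prints ' Hi! Hello, how are you?</s>' - a space before the user turn,
# a separator between turns, and the EOS token at the very end.
print(repr(template.render(messages=messages, eos_token="</s>")))
```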
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_89.txt b/chunked/nltk_chunking/_chat_templating/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..af887cda9fcb262fcc55349848520efa33fbb1d1 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_89.txt @@ -0,0 +1 @@ +Jinja is a templating language that allows you to write simple code that generates text. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_9.txt b/chunked/nltk_chunking/_chat_templating/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..33b79472da2a0e868de99b29c7f096532c666814 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_9.txt @@ -0,0 +1 @@ +How can I help you today? \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_90.txt b/chunked/nltk_chunking/_chat_templating/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..88c981f712df42cbd12e7bd543a1f90d52a31df3 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_90.txt @@ -0,0 +1,2 @@ +In many ways, the code and +syntax resembles Python. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_91.txt b/chunked/nltk_chunking/_chat_templating/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..a01b7d07a51234d67987087d4013fb8150ff6d14 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_91.txt @@ -0,0 +1,11 @@ +In pure Python, this template would look something like this: +python +for idx, message in enumerate(messages): + if message['role'] == 'user': + print(' ') + print(message['content']) + if not idx == len(messages) - 1: # Check for the last message in the conversation + print(' ') +print(eos_token) +Effectively, the template does three things: +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_92.txt b/chunked/nltk_chunking/_chat_templating/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd82be2c1666f3cffa9118de995d4499eed71031 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_92.txt @@ -0,0 +1 @@ +For each message, if the message is a user message, add a blank space before it, otherwise print nothing. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_93.txt b/chunked/nltk_chunking/_chat_templating/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_93.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_94.txt b/chunked/nltk_chunking/_chat_templating/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..6980645b0697e7106d1f30085bb1d6a4ff61e5e3 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_94.txt @@ -0,0 +1,2 @@ +Add the message content +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_95.txt b/chunked/nltk_chunking/_chat_templating/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4635bf384c72e222015a033c91678d08039e5c2 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_95.txt @@ -0,0 +1 @@ +If the message is not the last message, add two spaces after it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_96.txt b/chunked/nltk_chunking/_chat_templating/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a35647a927803d6ebe52af1bbf621af1a132806 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_96.txt @@ -0,0 +1 @@ +After the final message, print the EOS token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_97.txt b/chunked/nltk_chunking/_chat_templating/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecef6ad53f922589b0a096f2fb0a69e8e64d5132 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_97.txt @@ -0,0 +1,2 @@ +This is a pretty simple template - it doesn't add any control tokens, and it doesn't support "system" messages, which +are a common way to give the model directives about how it should behave in the subsequent conversation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_98.txt b/chunked/nltk_chunking/_chat_templating/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ab7778cc2bdbd9f2187e98af984541cbfec019e --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_98.txt @@ -0,0 +1 @@ +But Jinja gives you a lot of flexibility to do those things! \ No newline at end of file diff --git a/chunked/nltk_chunking/_chat_templating/chunk_99.txt b/chunked/nltk_chunking/_chat_templating/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..71b0e71651c1c21800b779439643cbefc21a2b37 --- /dev/null +++ b/chunked/nltk_chunking/_chat_templating/chunk_99.txt @@ -0,0 +1,3 @@ +Let's see a Jinja template that can format inputs +similarly to the way LLaMA formats them (note that the real LLaMA template includes handling for default system +messages and slightly different system message handling in general - don't use this one in your actual code!) \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_0.txt b/chunked/nltk_chunking/_community/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6ddfde41ee4655fedc2f936d07e02e97865647b --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_0.txt @@ -0,0 +1,3 @@ + +Community +This page regroups resources around 🤗 Transformers developed by the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_1.txt b/chunked/nltk_chunking/_community/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..120670afe0443cfadd809b607cf1cf36c26d75e4 --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_1.txt @@ -0,0 +1,4 @@ +Community resources: +| Resource | Description | Author | +|:----------|:-------------|------:| +| Hugging Face Transformers Glossary Flashcards | A set of flashcards based on the Transformers Docs Glossary that has been put into a form which can be easily learned/revised using Anki an open source, cross platform app specifically designed for long term knowledge retention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_2.txt b/chunked/nltk_chunking/_community/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c100d40d02de4a458cd8c6084be761402f24916 --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_2.txt @@ -0,0 +1 @@ +See this Introductory video on how to use the flashcards. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_3.txt b/chunked/nltk_chunking/_community/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc6f983c268cf71876fe515e4c43173ea99284ea --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_3.txt @@ -0,0 +1,6 @@ +| Darigov Research | +Community notebooks: +| Notebook | Description | Author | | +|:----------|:-------------|:-------------|------:| +| Fine-tune a pre-trained Transformer to generate lyrics | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | Aleksey Korshuk | | +| Train T5 in Tensorflow 2 | How to train T5 for any task using Tensorflow 2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_4.txt b/chunked/nltk_chunking/_community/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..8399f3a7c130767345b24d9608d3fc3a09cb5c23 --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_4.txt @@ -0,0 +1,18 @@ +This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | Muhammad Harris | | +| Train T5 on TPU | How to train T5 on SQUAD with Transformers and Nlp | Suraj Patil | | +| Fine-tune T5 for Classification and Multiple Choice | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | Suraj Patil | | +| Fine-tune DialoGPT on New Datasets and Languages | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | Nathan Cooper | | +| Long Sequence Modeling with Reformer | How to train on sequences as long as 500,000 tokens with Reformer | Patrick von Platen | | +| Fine-tune BART for Summarization | How to fine-tune BART for summarization with fastai using blurr | Wayde Gilliam | | +| Fine-tune a pre-trained Transformer on anyone's tweets | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | Boris Dayma | | +| Optimize 🤗 Hugging Face models with Weights & Biases | A complete tutorial showcasing W&B integration with Hugging Face | Boris Dayma | | +| Pretrain Longformer | How to build a "long" version of existing pretrained models | Iz Beltagy | | +| Fine-tune Longformer for QA | How to fine-tune longformer model for QA task | Suraj Patil | | +| Evaluate Model with 🤗nlp | How to evaluate longformer on TriviaQA with nlp | Patrick von Platen | | +| Fine-tune T5 for Sentiment Span Extraction | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | Lorenzo Ampil | | +| Fine-tune DistilBert for Multiclass Classification | How to fine-tune DistilBert for multiclass classification with PyTorch | Abhishek Kumar Mishra | | +|Fine-tune BERT for Multi-label Classification|How to fine-tune BERT for multi-label classification using PyTorch|Abhishek Kumar Mishra || +|Fine-tune T5 for Summarization|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|Abhishek Kumar Mishra || +|Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|Michael Benesty || +|Pretrain Reformer for Masked Language Modeling| How to train a Reformer model with bi-directional self-attention layers | Patrick von Platen | | +|Expand and Fine Tune Sci-BERT| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_5.txt b/chunked/nltk_chunking/_community/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5db1570ff2f62431fc721d22960f499a07ccc374 --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_5.txt @@ -0,0 +1,2 @@ +| Tanmay Thakur | | +|Fine Tune BlenderBotSmall for Summarization using the Trainer API| How to fine-tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_6.txt b/chunked/nltk_chunking/_community/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca022ed3f970b7c3aad1f1f9a68d33893f8df43e --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_6.txt @@ -0,0 +1,7 @@ +| Tanmay Thakur | | +|Fine-tune Electra and interpret with Integrated Gradients | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | Eliza Szczechla | | +|fine-tune a non-English GPT-2 Model with Trainer class | How to fine-tune a non-English GPT-2 Model with Trainer class | Philipp Schmid | | +|Fine-tune a DistilBERT Model for Multi Label Classification task | How to fine-tune a DistilBERT Model for Multi Label Classification task | Dhaval Taunk | | +|Fine-tune ALBERT for sentence-pair classification | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | Nadir El Manouzi | | +|Fine-tune Roberta for sentiment analysis | How to fine-tune a Roberta model for sentiment analysis | Dhaval Taunk | | +|Evaluating Question Generation Models | How accurate are the answers to questions generated by your seq2seq transformer model? \ No newline at end of file diff --git a/chunked/nltk_chunking/_community/chunk_7.txt b/chunked/nltk_chunking/_community/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..51af90901471fd8aae1c3931e0b80460d2130edd --- /dev/null +++ b/chunked/nltk_chunking/_community/chunk_7.txt @@ -0,0 +1,26 @@ +| Pascal Zoleko | | +|Classify text with DistilBERT and Tensorflow | How to fine-tune DistilBERT for text classification in TensorFlow | Peter Bayerle | | +|Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail | How to warm-start a EncoderDecoderModel with a google-bert/bert-base-uncased checkpoint for summarization on CNN/Dailymail | Patrick von Platen | | +|Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum | How to warm-start a shared EncoderDecoderModel with a FacebookAI/roberta-base checkpoint for summarization on BBC/XSum | Patrick von Platen | | +|Fine-tune TAPAS on Sequential Question Answering (SQA) | How to fine-tune TapasForQuestionAnswering with a tapas-base checkpoint on the Sequential Question Answering (SQA) dataset | Niels Rogge | | +|Evaluate TAPAS on Table Fact Checking (TabFact) | How to evaluate a fine-tuned TapasForSequenceClassification with a tapas-base-finetuned-tabfact checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | Niels Rogge | | +|Fine-tuning mBART for translation | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | Vasudev Gupta | | +|Fine-tune LayoutLM on FUNSD (a form understanding dataset) | How to fine-tune LayoutLMForTokenClassification on the FUNSD dataset for information extraction from scanned documents | Niels Rogge | | +|Fine-Tune DistilGPT2 and Generate Text | How to fine-tune DistilGPT2 and generate text | 
Aakash Tripathi | | +|Fine-Tune LED on up to 8K tokens | How to fine-tune LED on pubmed for long-range summarization | Patrick von Platen | | +|Evaluate LED on Arxiv | How to effectively evaluate LED on long-range summarization | Patrick von Platen | | +|Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset) | How to fine-tune LayoutLMForSequenceClassification on the RVL-CDIP dataset for scanned document classification | Niels Rogge | | +|Wav2Vec2 CTC decoding with GPT2 adjustment | How to decode CTC sequence with language model adjustment | Eric Lam | | +|Fine-tune BART for summarization in two languages with Trainer class | How to fine-tune BART for summarization in two languages with Trainer class | Eliza Szczechla | | +|Evaluate Big Bird on Trivia QA | How to evaluate BigBird on long document question answering on Trivia QA | Patrick von Platen | | +| Create video captions using Wav2Vec2 | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | Niklas Muennighoff | | +| Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | Niels Rogge | | +| Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | Niels Rogge | | +| Evaluate LUKE on Open Entity, an entity typing dataset | How to evaluate LukeForEntityClassification on the Open Entity dataset | Ikuya Yamada | | +| Evaluate LUKE on TACRED, a relation extraction dataset | How to evaluate LukeForEntityPairClassification on the TACRED dataset | Ikuya Yamada | | +| Evaluate LUKE on CoNLL-2003, an important NER benchmark | How to evaluate LukeForEntitySpanClassification on the CoNLL-2003 dataset | Ikuya Yamada | | +| Evaluate BigBird-Pegasus on PubMed dataset | How to evaluate BigBirdPegasusForConditionalGeneration on PubMed dataset | Vasudev Gupta | | +| Speech Emotion Classification with Wav2Vec2 | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | Mehrdad Farahani | | +| Detect objects in an image with DETR | How to use a trained DetrForObjectDetection model to detect objects in an image and visualize attention | Niels Rogge | | +| Fine-tune DETR on a custom object detection dataset | How to fine-tune DetrForObjectDetection on a custom object detection dataset | Niels Rogge | | +| Finetune T5 for Named Entity Recognition | How to fine-tune T5 on a Named Entity Recognition Task | Ogundepo Odunayo | | \ No newline at end of file diff --git a/chunked/nltk_chunking/_contributing/chunk_0.txt b/chunked/nltk_chunking/_contributing/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..c97564d93a7f0a753a23cd97d2467d595bd154ff --- /dev/null +++ b/chunked/nltk_chunking/_contributing/chunk_0.txt @@ -0,0 +1 @@ +../../../CONTRIBUTING.md \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_0.txt b/chunked/nltk_chunking/_create_a_model/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aeff76d6b4280523266fa54687fab010d7f1486 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_0.txt @@ -0,0 +1,3 @@ + +Create a custom architecture +An AutoClass automatically infers the model architecture and downloads pretrained configuration and weights. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_1.txt b/chunked/nltk_chunking/_create_a_model/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2b2e7bb16ae8d8c1fff18c88429094d3e799631 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_1.txt @@ -0,0 +1 @@ +Generally, we recommend using an AutoClass to produce checkpoint-agnostic code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_10.txt b/chunked/nltk_chunking/_create_a_model/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fadd6aeff2f66b7566f872c347ba5c302fa764c --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_10.txt @@ -0,0 +1 @@ +Create a processor for multimodal tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_11.txt b/chunked/nltk_chunking/_create_a_model/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..50bb8afbaa04894a4328438ecd216d65f684f8e5 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_11.txt @@ -0,0 +1,2 @@ +Configuration +A configuration refers to a model's specific attributes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_12.txt b/chunked/nltk_chunking/_create_a_model/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..88f09011a41e6570e533e4220ded90b4be35726b --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_12.txt @@ -0,0 +1 @@ +Each model configuration has different attributes; for instance, all NLP models have the hidden_size, num_attention_heads, num_hidden_layers and vocab_size attributes in common. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_13.txt b/chunked/nltk_chunking/_create_a_model/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d9010af0212a0e096488f57691290917558e356 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_13.txt @@ -0,0 +1 @@ +These attributes specify the number of attention heads or hidden layers to construct a model with. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_14.txt b/chunked/nltk_chunking/_create_a_model/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..970d93405b107ea1b558ed26379aa48590483ba5 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_14.txt @@ -0,0 +1,25 @@ +Get a closer look at DistilBERT by accessing [DistilBertConfig] to inspect its attributes: + +from transformers import DistilBertConfig +config = DistilBertConfig() +print(config) +DistilBertConfig { + "activation": "gelu", + "attention_dropout": 0.1, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} + +[DistilBertConfig] displays all the default attributes used to build a base [DistilBertModel].
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_15.txt b/chunked/nltk_chunking/_create_a_model/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2e6bd45bebd828c1048f6bf7b7e86904f75701f --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_15.txt @@ -0,0 +1 @@ +All attributes are customizable, creating space for experimentation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_16.txt b/chunked/nltk_chunking/_create_a_model/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b7b53417427bb76a4efe10520a322222ef24412 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_16.txt @@ -0,0 +1,3 @@ +For example, you can customize a default model to: + +Try a different activation function with the activation parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_17.txt b/chunked/nltk_chunking/_create_a_model/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..163dfbe12e177375b37fa3eb0399514fa60dac19 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_17.txt @@ -0,0 +1 @@ +Use a higher dropout ratio for the attention probabilities with the attention_dropout parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_18.txt b/chunked/nltk_chunking/_create_a_model/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8ca52d661712038ee5dc3a4b8d51f4aeba31727 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_18.txt @@ -0,0 +1,26 @@ +my_config = DistilBertConfig(activation="relu", attention_dropout=0.4) +print(my_config) +DistilBertConfig { + "activation": "relu", + "attention_dropout": 0.4, + "dim": 768, + "dropout": 0.1, + "hidden_dim": 3072, + "initializer_range": 0.02, + "max_position_embeddings": 512, + "model_type": "distilbert", + "n_heads": 12, + "n_layers": 6, + "pad_token_id": 0, + "qa_dropout": 0.1, + "seq_classif_dropout": 0.2, + "sinusoidal_pos_embds": false, + "transformers_version": "4.16.2", + "vocab_size": 30522 +} + +Pretrained model attributes can be modified in the [~PretrainedConfig.from_pretrained] function: + +my_config = DistilBertConfig.from_pretrained("distilbert/distilbert-base-uncased", activation="relu", attention_dropout=0.4) + +Once you are satisfied with your model configuration, you can save it with [~PretrainedConfig.save_pretrained]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_19.txt b/chunked/nltk_chunking/_create_a_model/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..28f1e14e947688a47d3121ce9817b4069372b1e7 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_19.txt @@ -0,0 +1,9 @@ +Your configuration file is stored as a JSON file in the specified save directory: + +my_config.save_pretrained(save_directory="./your_model_save_path") + +To reuse the configuration file, load it with [~PretrainedConfig.from_pretrained]: + +my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") + +You can also save your configuration file as a dictionary or even just the difference between your custom configuration attributes and the default configuration attributes! 
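As a minimal sketch of the "dictionary or just the difference" option mentioned above, using the to_dict and to_diff_dict helpers on the same custom configuration (treat the exact output as illustrative):

```python
from transformers import DistilBertConfig

my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)

# The full configuration as a plain Python dictionary.
full_dict = my_config.to_dict()

# Only the attributes that differ from the DistilBERT defaults,
# i.e. activation and attention_dropout here.
print(my_config.to_diff_dict())
```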
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_2.txt b/chunked/nltk_chunking/_create_a_model/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..d455be01c4399041e8863f777cab6e91934c5501 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_2.txt @@ -0,0 +1 @@ +But users who want more control over specific model parameters can create a custom 🤗 Transformers model from just a few base classes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_20.txt b/chunked/nltk_chunking/_create_a_model/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f08fc26bb0175a7d376bfff6262cba271e79d9ab --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_20.txt @@ -0,0 +1 @@ +See the configuration documentation for more details. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_21.txt b/chunked/nltk_chunking/_create_a_model/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3cb353185fd83edab83dab721069a4dffea414c --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_21.txt @@ -0,0 +1,2 @@ +Model +The next step is to create a model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_22.txt b/chunked/nltk_chunking/_create_a_model/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec13f498a8bfb4de5d853edd34ea354d49219f40 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_22.txt @@ -0,0 +1 @@ +The model - also loosely referred to as the architecture - defines what each layer is doing and what operations are happening. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_23.txt b/chunked/nltk_chunking/_create_a_model/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..c58db290ccd1602c835505078eec5da945fae52b --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_23.txt @@ -0,0 +1 @@ +Attributes like num_hidden_layers from the configuration are used to define the architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_24.txt b/chunked/nltk_chunking/_create_a_model/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..414f1f7b8ddfcb966bbb2b5360e1b71359838b70 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_24.txt @@ -0,0 +1 @@ +Every model shares the base class [PreTrainedModel] and a few common methods like resizing input embeddings and pruning self-attention heads. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_25.txt b/chunked/nltk_chunking/_create_a_model/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8f43aadfef06a5fec5bac258ab35dd6ec09bc48 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_25.txt @@ -0,0 +1 @@ +In addition, all models are also either a torch.nn.Module, tf.keras.Model or flax.linen.Module subclass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_26.txt b/chunked/nltk_chunking/_create_a_model/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..281ef9c8b64093ffda0f9dacbbf6bcfaafae8a79 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_26.txt @@ -0,0 +1 @@ +This means models are compatible with each of their respective framework's usage. 
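A minimal sketch of the shared PreTrainedModel utilities mentioned above - resizing the input embeddings and pruning self-attention heads; the added token and the pruned head indices are arbitrary examples:

```python
from transformers import DistilBertModel, DistilBertTokenizerFast

model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")

# Grow the input embedding matrix after extending the vocabulary.
tokenizer.add_tokens(["<custom_token>"])
model.resize_token_embeddings(len(tokenizer))

# Remove attention heads 0 and 2 from the first transformer layer.
model.prune_heads({0: [0, 2]})
```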
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_27.txt b/chunked/nltk_chunking/_create_a_model/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..06d2198134d91fd37d7817bb56e388e26ac3c9b5 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_27.txt @@ -0,0 +1,7 @@ +Load your custom configuration attributes into the model: + +from transformers import DistilBertModel +my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json") +model = DistilBertModel(my_config) + +This creates a model with random values instead of pretrained weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_28.txt b/chunked/nltk_chunking/_create_a_model/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c5d09a58092631a2cbfae8370338f5202e86997 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_28.txt @@ -0,0 +1 @@ +You won't be able to use this model for anything useful yet until you train it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_29.txt b/chunked/nltk_chunking/_create_a_model/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..696098edf27b9596257466d073b18cd3ba73ae6b --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_29.txt @@ -0,0 +1 @@ +Training is a costly and time-consuming process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_3.txt b/chunked/nltk_chunking/_create_a_model/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..d42cb817d1ec404bdb377a77511829e2276c1415 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_3.txt @@ -0,0 +1 @@ +This could be particularly useful for anyone who is interested in studying, training or experimenting with a 🤗 Transformers model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_30.txt b/chunked/nltk_chunking/_create_a_model/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f05b51fde0f72af9ff270c173e9756723aadfea --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_30.txt @@ -0,0 +1 @@ +It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_31.txt b/chunked/nltk_chunking/_create_a_model/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c435f29a2400ad78fdb9c6c5a409c4f82fe3231e --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_31.txt @@ -0,0 +1,5 @@ +Create a pretrained model with [~PreTrainedModel.from_pretrained]: + +model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_32.txt b/chunked/nltk_chunking/_create_a_model/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcd661141a974014c1e53ed963e20642c9243952 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_32.txt @@ -0,0 +1,11 @@ +However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) + +Load your custom configuration attributes into the model: + +from transformers import TFDistilBertModel +my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json") +tf_model = TFDistilBertModel(my_config) + +This creates a model with random values instead of pretrained weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_33.txt b/chunked/nltk_chunking/_create_a_model/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c5d09a58092631a2cbfae8370338f5202e86997 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_33.txt @@ -0,0 +1 @@ +You won't be able to use this model for anything useful yet until you train it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_34.txt b/chunked/nltk_chunking/_create_a_model/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..696098edf27b9596257466d073b18cd3ba73ae6b --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_34.txt @@ -0,0 +1 @@ +Training is a costly and time-consuming process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_35.txt b/chunked/nltk_chunking/_create_a_model/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f05b51fde0f72af9ff270c173e9756723aadfea --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_35.txt @@ -0,0 +1 @@ +It is generally better to use a pretrained model to obtain better results faster, while using only a fraction of the resources required for training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_36.txt b/chunked/nltk_chunking/_create_a_model/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d8eaafc1a51c13463db009fa0ce705775d472d7 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_36.txt @@ -0,0 +1,5 @@ +Create a pretrained model with [~TFPreTrainedModel.from_pretrained]: + +tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased") + +When you load pretrained weights, the default model configuration is automatically loaded if the model is provided by 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_37.txt b/chunked/nltk_chunking/_create_a_model/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..532a0e80d60a3daa6f66c9875d26062142cc77a6 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_37.txt @@ -0,0 +1,6 @@ +However, you can still replace - some or all of - the default model configuration attributes with your own if you'd like: + +tf_model = TFDistilBertModel.from_pretrained("distilbert/distilbert-base-uncased", config=my_config) + +Model heads +At this point, you have a base DistilBERT model which outputs the hidden states. 
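A minimal sketch of what "outputs the hidden states" means in practice for the base model; the input sentence is arbitrary:

```python
import torch
from transformers import DistilBertModel, DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
model = DistilBertModel.from_pretrained("distilbert/distilbert-base-uncased")

inputs = tokenizer("Hello, world!", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One hidden state per token: (batch_size, sequence_length, hidden_size)
print(outputs.last_hidden_state.shape)
```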
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_38.txt b/chunked/nltk_chunking/_create_a_model/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..97fbd185e51262656bcd575096d63ff7264a0bfd --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_38.txt @@ -0,0 +1 @@ +The hidden states are passed as inputs to a model head to produce the final output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_39.txt b/chunked/nltk_chunking/_create_a_model/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e552f822c670763c295ba6bc6d0d67a7dc7702a --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_39.txt @@ -0,0 +1 @@ +🤗 Transformers provides a different model head for each task as long as a model supports the task (i.e., you can't use DistilBERT for a sequence-to-sequence task like translation). \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_4.txt b/chunked/nltk_chunking/_create_a_model/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7efd75b6d90c901bad4ed5a9747b29759cf05fb --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_4.txt @@ -0,0 +1 @@ +In this guide, dive deeper into creating a custom model without an AutoClass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_40.txt b/chunked/nltk_chunking/_create_a_model/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..924f6d3fb0bf53c54fea13911514f0100ca56899 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_40.txt @@ -0,0 +1 @@ +For example, [DistilBertForSequenceClassification] is a base DistilBERT model with a sequence classification head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_41.txt b/chunked/nltk_chunking/_create_a_model/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4425b98a6068f2577087301ecb29106e523b48 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_41.txt @@ -0,0 +1 @@ +The sequence classification head is a linear layer on top of the pooled outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_42.txt b/chunked/nltk_chunking/_create_a_model/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b024d8449a52cdce5c5eee12c567226c8e13e2e --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_42.txt @@ -0,0 +1,4 @@ +from transformers import DistilBertForSequenceClassification +model = DistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse this checkpoint for another task by switching to a different model head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_43.txt b/chunked/nltk_chunking/_create_a_model/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..30df6d5e9d1d86a31ad276de122d925d6eb27c61 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_43.txt @@ -0,0 +1 @@ +For a question answering task, you would use the [DistilBertForQuestionAnswering] model head. 
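As a quick, illustrative sketch of what that head returns (it is randomly initialized on top of the base checkpoint until you fine-tune it, so only the output shapes are meaningful here):

```python
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizerFast

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased")
model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

question = "What does the QA head output?"
context = "The question answering head outputs start and end logits."
inputs = tokenizer(question, context, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# One score per token for the start and for the end of the answer span.
print(outputs.start_logits.shape, outputs.end_logits.shape)
```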
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_44.txt b/chunked/nltk_chunking/_create_a_model/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e50a5ce6496e036e1f3ebe05d69f08290194da9 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_44.txt @@ -0,0 +1 @@ +The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_45.txt b/chunked/nltk_chunking/_create_a_model/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..96757261b47390e242a5420fd3d769b6435c252c --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_45.txt @@ -0,0 +1,6 @@ +from transformers import DistilBertForQuestionAnswering +model = DistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") +`` + + +For example, [TFDistilBertForSequenceClassification`] is a base DistilBERT model with a sequence classification head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_46.txt b/chunked/nltk_chunking/_create_a_model/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d4425b98a6068f2577087301ecb29106e523b48 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_46.txt @@ -0,0 +1 @@ +The sequence classification head is a linear layer on top of the pooled outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_47.txt b/chunked/nltk_chunking/_create_a_model/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ffa2c32bac109440fd747329d086884c8002137 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_47.txt @@ -0,0 +1,4 @@ +from transformers import TFDistilBertForSequenceClassification +tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert/distilbert-base-uncased") + +Easily reuse this checkpoint for another task by switching to a different model head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_48.txt b/chunked/nltk_chunking/_create_a_model/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a0d727587d77e9e610a194bf1d33355028e725d --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_48.txt @@ -0,0 +1 @@ +For a question answering task, you would use the [TFDistilBertForQuestionAnswering] model head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_49.txt b/chunked/nltk_chunking/_create_a_model/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e50a5ce6496e036e1f3ebe05d69f08290194da9 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_49.txt @@ -0,0 +1 @@ +The question answering head is similar to the sequence classification head except it is a linear layer on top of the hidden states output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_5.txt b/chunked/nltk_chunking/_create_a_model/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..77aeb76d5f355db1d048f2e3dbdbe069627820d6 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_5.txt @@ -0,0 +1,3 @@ +Learn how to: + +Load and customize a model configuration. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_50.txt b/chunked/nltk_chunking/_create_a_model/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..face53b4456dc24a3127f88957c799cb0a978719 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_50.txt @@ -0,0 +1,5 @@ +from transformers import TFDistilBertForQuestionAnswering +tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased") + +Tokenizer +The last base class you need before using a model for textual data is a tokenizer to convert raw text to tensors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_51.txt b/chunked/nltk_chunking/_create_a_model/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..997e570c96188aa06c34146132846309042a2c78 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_51.txt @@ -0,0 +1,3 @@ +There are two types of tokenizers you can use with 🤗 Transformers: + +[PreTrainedTokenizer]: a Python implementation of a tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_52.txt b/chunked/nltk_chunking/_create_a_model/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..b165734e06485f0556bec0cf2213ee8f9db92901 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_52.txt @@ -0,0 +1 @@ +[PreTrainedTokenizerFast]: a tokenizer from our Rust-based 🤗 Tokenizer library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_53.txt b/chunked/nltk_chunking/_create_a_model/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..626be49d984bd6823fb61b28e1391b54451a5f7a --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_53.txt @@ -0,0 +1 @@ +This tokenizer type is significantly faster - especially during batch tokenization - due to its Rust implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_54.txt b/chunked/nltk_chunking/_create_a_model/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad69d706a33953b3b5ec0c4ee08bbccd29d2e387 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_54.txt @@ -0,0 +1 @@ +The fast tokenizer also offers additional methods like offset mapping which maps tokens to their original words or characters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_55.txt b/chunked/nltk_chunking/_create_a_model/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ec493ab42bbc79ed157f648002d9f8976f3a664 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_55.txt @@ -0,0 +1 @@ +Both tokenizers support common methods such as encoding and decoding, adding new tokens, and managing special tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_56.txt b/chunked/nltk_chunking/_create_a_model/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..bfb9aa265da79c48cb45872a9f7d8d5e4138eb23 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_56.txt @@ -0,0 +1 @@ +Not every model supports a fast tokenizer. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_57.txt b/chunked/nltk_chunking/_create_a_model/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a99804c46ea070fc85143bf0ae0841b340e8b14 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_57.txt @@ -0,0 +1 @@ +Take a look at this table to check if a model has fast tokenizer support. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_58.txt b/chunked/nltk_chunking/_create_a_model/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e424c67d10e6d3ce8f16d136f48c6932b1f1d26 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_58.txt @@ -0,0 +1,6 @@ +If you trained your own tokenizer, you can create one from your vocabulary file: + +from transformers import DistilBertTokenizer +my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left") + +It is important to remember the vocabulary from a custom tokenizer will be different from the vocabulary generated by a pretrained model's tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_59.txt b/chunked/nltk_chunking/_create_a_model/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..f418f02b1b72358b666f61296ede1a46471fd37b --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_59.txt @@ -0,0 +1 @@ +You need to use a pretrained model's vocabulary if you are using a pretrained model, otherwise the inputs won't make sense. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_6.txt b/chunked/nltk_chunking/_create_a_model/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9266da17c92d60ba2ef919c9bd016c0bbbca9c --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_6.txt @@ -0,0 +1 @@ +Create a model architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_60.txt b/chunked/nltk_chunking/_create_a_model/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b4e21935cf193fbbed46dd33cc84baac026402d --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_60.txt @@ -0,0 +1,11 @@ +Create a tokenizer with a pretrained model's vocabulary with the [DistilBertTokenizer] class: + +from transformers import DistilBertTokenizer +slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert/distilbert-base-uncased") + +Create a fast tokenizer with the [DistilBertTokenizerFast] class: + +from transformers import DistilBertTokenizerFast +fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert/distilbert-base-uncased") + +By default, [AutoTokenizer] will try to load a fast tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_61.txt b/chunked/nltk_chunking/_create_a_model/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..e057e29f2f01972dbbbc6dc85b89f57ce7ac27fb --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_61.txt @@ -0,0 +1 @@ +You can disable this behavior by setting use_fast=False in from_pretrained. 
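A minimal sketch contrasting the two tokenizer types, including the offset mapping mentioned earlier (only available on fast tokenizers); the input sentence is arbitrary:

```python
from transformers import AutoTokenizer

# The fast (Rust-backed) tokenizer is loaded by default when available.
fast_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
encoding = fast_tokenizer("Tokenizers are fun!", return_offsets_mapping=True)

# Each token maps back to a (start, end) character span in the original text.
print(list(zip(encoding.tokens(), encoding["offset_mapping"])))

# Force the slow, pure-Python implementation instead.
slow_tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased", use_fast=False)
print(slow_tokenizer.is_fast)  # False
```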
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_62.txt b/chunked/nltk_chunking/_create_a_model/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a83b7a025cde6026fe6e11fed9c3bfe6b891a8f9 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_62.txt @@ -0,0 +1,2 @@ +Image processor +An image processor processes vision inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_63.txt b/chunked/nltk_chunking/_create_a_model/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..789a47defe40d7ba820faf0136a3cba873d08934 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_63.txt @@ -0,0 +1 @@ +It inherits from the base [~image_processing_utils.ImageProcessingMixin] class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_64.txt b/chunked/nltk_chunking/_create_a_model/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f0405330f8807dcab9c1bcbf19418066a096c15 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_64.txt @@ -0,0 +1 @@ +To use, create an image processor associated with the model you're using. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_65.txt b/chunked/nltk_chunking/_create_a_model/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..07295fe13e27716b49c12f8dfdfcd95b0cfb79dc --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_65.txt @@ -0,0 +1,24 @@ +For example, create a default [ViTImageProcessor] if you are using ViT for image classification: + +from transformers import ViTImageProcessor +vit_extractor = ViTImageProcessor() +print(vit_extractor) +ViTImageProcessor { + "do_normalize": true, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.5, + 0.5, + 0.5 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": 2, + "size": 224 +} + +If you aren't looking for any customization, just use the from_pretrained method to load a model's default image processor parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_66.txt b/chunked/nltk_chunking/_create_a_model/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..5daf6e1080427fb94e560de4a2527f495b63c58f --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_66.txt @@ -0,0 +1,26 @@ +Modify any of the [ViTImageProcessor] parameters to create your custom image processor: + +from transformers import ViTImageProcessor +my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3]) +print(my_vit_extractor) +ViTImageProcessor { + "do_normalize": false, + "do_resize": true, + "image_processor_type": "ViTImageProcessor", + "image_mean": [ + 0.3, + 0.3, + 0.3 + ], + "image_std": [ + 0.5, + 0.5, + 0.5 + ], + "resample": "PIL.Image.BOX", + "size": 224 +} + +Backbone + +Computer vision models consist of a backbone, neck, and head. 
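Before the backbone discussion continues, here is a minimal usage sketch for the image processor configured above; the random image simply stands in for real data:

```python
import numpy as np
from PIL import Image
from transformers import ViTImageProcessor

processor = ViTImageProcessor()

# A random RGB image in place of a real photo.
image = Image.fromarray(np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8))

inputs = processor(images=image, return_tensors="pt")
# Resized and normalized pixel values: (batch_size, channels, 224, 224) with the defaults.
print(inputs["pixel_values"].shape)
```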
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_67.txt b/chunked/nltk_chunking/_create_a_model/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..58eae897b478114b3ecffcabc7520a59339ada44 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_67.txt @@ -0,0 +1 @@ +The backbone extracts features from an input image, the neck combines and enhances the extracted features, and the head is used for the main task (e.g., object detection). \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_68.txt b/chunked/nltk_chunking/_create_a_model/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..efd73604c994cafe1894529a451f4d1d30d78d90 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_68.txt @@ -0,0 +1 @@ +Start by initializing a backbone in the model config and specify whether you want to load pretrained weights or load randomly initialized weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_69.txt b/chunked/nltk_chunking/_create_a_model/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..50269d0264cd0f2f18db2c731ea05f4c04a56921 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_69.txt @@ -0,0 +1 @@ +Then you can pass the model config to the model head. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_7.txt b/chunked/nltk_chunking/_create_a_model/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..4551424115fc837608e06d80bc98c4b76f9660d8 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_7.txt @@ -0,0 +1 @@ +Create a slow and fast tokenizer for text. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_70.txt b/chunked/nltk_chunking/_create_a_model/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f1867242ee720617a6f4868172cee13fbd10e9f --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_70.txt @@ -0,0 +1,3 @@ +For example, to load a ResNet backbone into a MaskFormer model with an instance segmentation head: + +Set use_pretrained_backbone=True to load pretrained ResNet weights for the backbone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_71.txt b/chunked/nltk_chunking/_create_a_model/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..a072ca40d62c804dbb9464fe72de5c24ed0c2840 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_71.txt @@ -0,0 +1,5 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=True) # backbone and neck config +model = MaskFormerForInstanceSegmentation(config) # head + +You could also load the backbone config separately and then pass it to the model config. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_72.txt b/chunked/nltk_chunking/_create_a_model/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..45574163f8f4d896a56b3f9e35919598584b8fa0 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_72.txt @@ -0,0 +1,6 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +backbone_config = ResNetConfig.from_pretrained("microsoft/resnet-50") +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) + +Set use_pretrained_backbone=False to randomly initialize a ResNet backbone. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_73.txt b/chunked/nltk_chunking/_create_a_model/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..34c8314663b409825911fa0e108bbb7765b4ce56 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_73.txt @@ -0,0 +1,5 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +config = MaskFormerConfig(backbone="microsoft/resnet50", use_pretrained_backbone=False) # backbone and neck config +model = MaskFormerForInstanceSegmentation(config) # head + +You could also load the backbone config separately and then pass it to the model config. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_74.txt b/chunked/nltk_chunking/_create_a_model/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3eac9355e114097891b5f1a11b8e322fee0f459 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_74.txt @@ -0,0 +1,6 @@ +from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig +backbone_config = ResNetConfig() +config = MaskFormerConfig(backbone_config=backbone_config) +model = MaskFormerForInstanceSegmentation(config) + +timm models are loaded with [TimmBackbone] and [TimmBackboneConfig]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_75.txt b/chunked/nltk_chunking/_create_a_model/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..7557a4572bdb17aab6b132e76aa9fe5f5c26340a --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_75.txt @@ -0,0 +1,7 @@ +thon +from transformers import TimmBackboneConfig, TimmBackbone +backbone_config = TimmBackboneConfig("resnet50") +model = TimmBackbone(config=backbone_config) + +Feature extractor +A feature extractor processes audio inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_76.txt b/chunked/nltk_chunking/_create_a_model/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a4e4522eea3431f74b8a1a1bd7a8eb7ae5cb429 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_76.txt @@ -0,0 +1 @@ +It inherits from the base [~feature_extraction_utils.FeatureExtractionMixin] class, and may also inherit from the [SequenceFeatureExtractor] class for processing audio inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_77.txt b/chunked/nltk_chunking/_create_a_model/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0db70c70af87960c692e3234e7ee1288e76e04d --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_77.txt @@ -0,0 +1 @@ +To use, create a feature extractor associated with the model you're using. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_78.txt b/chunked/nltk_chunking/_create_a_model/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..737642505989a51406dc64fd36321970c78a42b3 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_78.txt @@ -0,0 +1,16 @@ +For example, create a default [Wav2Vec2FeatureExtractor] if you are using Wav2Vec2 for audio classification: + +from transformers import Wav2Vec2FeatureExtractor +w2v2_extractor = Wav2Vec2FeatureExtractor() +print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": true, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 16000 +} + +If you aren't looking for any customization, just use the from_pretrained method to load a model's default feature extractor parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_79.txt b/chunked/nltk_chunking/_create_a_model/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..36430d5013e18f37af7f953c682e1bf8009dce7c --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_79.txt @@ -0,0 +1,17 @@ +Modify any of the [Wav2Vec2FeatureExtractor] parameters to create your custom feature extractor: + +from transformers import Wav2Vec2FeatureExtractor +w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False) +print(w2v2_extractor) +Wav2Vec2FeatureExtractor { + "do_normalize": false, + "feature_extractor_type": "Wav2Vec2FeatureExtractor", + "feature_size": 1, + "padding_side": "right", + "padding_value": 0.0, + "return_attention_mask": false, + "sampling_rate": 8000 +} + +Processor +For models that support multimodal tasks, 🤗 Transformers offers a processor class that conveniently wraps processing classes such as a feature extractor and a tokenizer into a single object. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_8.txt b/chunked/nltk_chunking/_create_a_model/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..1aec9f90098e8d28181219bfcde9459c7cb087d7 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_8.txt @@ -0,0 +1 @@ +Create an image processor for vision tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_80.txt b/chunked/nltk_chunking/_create_a_model/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..328b2b200f3504090808e7af6c780383277d724e --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_80.txt @@ -0,0 +1 @@ +For example, let's use the [Wav2Vec2Processor] for an automatic speech recognition task (ASR). \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_81.txt b/chunked/nltk_chunking/_create_a_model/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..f42148c21a49296898ac1881a1c92781225189cd --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_81.txt @@ -0,0 +1 @@ +ASR transcribes audio to text, so you will need a feature extractor and a tokenizer. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_82.txt b/chunked/nltk_chunking/_create_a_model/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..76b4f2ea4427864ba2901a22e7eeeeefe82bb3fb --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_82.txt @@ -0,0 +1,16 @@ +Create a feature extractor to handle the audio inputs: + +from transformers import Wav2Vec2FeatureExtractor +feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True) + +Create a tokenizer to handle the text inputs: + +from transformers import Wav2Vec2CTCTokenizer +tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt") + +Combine the feature extractor and tokenizer in [Wav2Vec2Processor]: + +from transformers import Wav2Vec2Processor +processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer) + +With two basic classes - configuration and model - and an additional preprocessing class (tokenizer, image processor, feature extractor, or processor), you can create any of the models supported by 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_83.txt b/chunked/nltk_chunking/_create_a_model/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cf69efa0969d879d6f53f128192efef1c441a68 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_83.txt @@ -0,0 +1 @@ +Each of these base classes are configurable, allowing you to use the specific attributes you want. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_84.txt b/chunked/nltk_chunking/_create_a_model/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..364b7db36cfd45d9cd27e54248c9693fd99e9473 --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_84.txt @@ -0,0 +1 @@ +You can easily setup a model for training or modify an existing pretrained model to fine-tune. \ No newline at end of file diff --git a/chunked/nltk_chunking/_create_a_model/chunk_9.txt b/chunked/nltk_chunking/_create_a_model/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..25bd56e2a4b03824e80fe93bc39818687756ff8e --- /dev/null +++ b/chunked/nltk_chunking/_create_a_model/chunk_9.txt @@ -0,0 +1 @@ +Create a feature extractor for audio tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_0.txt b/chunked/nltk_chunking/_custom_models/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..84d48c78dfaf98183e895ef87e13cd137de73509 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_0.txt @@ -0,0 +1,3 @@ + +Building custom models +The 🤗 Transformers library is designed to be easily extensible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_1.txt b/chunked/nltk_chunking/_custom_models/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..ded5d498a4e689e0f82fdf809c5c19fe1e368a95 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_1.txt @@ -0,0 +1,2 @@ +Every model is fully coded in a given subfolder +of the repository with no abstraction, so you can easily copy a modeling file and tweak it to your needs. 
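Referring back to the customized Wav2Vec2FeatureExtractor shown a few chunks above, here is a minimal hedged sketch of applying it to raw audio (the variable names and dummy arrays are assumptions for illustration): the call pads the batch and returns the input_values the model expects.

import numpy as np
from transformers import Wav2Vec2FeatureExtractor
w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False)  # the custom settings from above
raw_audio = [np.random.randn(8000).astype(np.float32), np.random.randn(4000).astype(np.float32)]  # dummy clips
inputs = w2v2_extractor(raw_audio, sampling_rate=8000, padding=True, return_tensors="np")
print(inputs["input_values"].shape)  # (2, 8000) after padding the shorter clip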
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_10.txt b/chunked/nltk_chunking/_custom_models/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..5190c19ef8d1074b121788a8c64b7912a25bb25a --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_10.txt @@ -0,0 +1,3 @@ +Writing your model in this +style results in simpler code with a clear "source of truth" for any hyperparameters, and also makes it easier +to reuse code from other models in transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_11.txt b/chunked/nltk_chunking/_custom_models/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..717b02fdc4f2144422f71b1ca1e6428ddfe0cdbd --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_11.txt @@ -0,0 +1 @@ +In our example, we will take a couple of arguments of the ResNet class that we might want to tweak. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_12.txt b/chunked/nltk_chunking/_custom_models/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1c3c9b5ccdc58a3cdde22ae4df16e56d1b8125e --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_12.txt @@ -0,0 +1,2 @@ +Different +configurations will then give us the different types of ResNets that are possible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_13.txt b/chunked/nltk_chunking/_custom_models/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c46304bc9fbb5236f9afd4573b76998d4d9f9f9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_13.txt @@ -0,0 +1,2 @@ +We then just store those arguments, +after checking the validity of a few of them. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_14.txt b/chunked/nltk_chunking/_custom_models/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..9261c323fa296699aee28b11a9e79c2445bfb61d --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_14.txt @@ -0,0 +1,20 @@ +python +from transformers import PretrainedConfig +from typing import List +class ResnetConfig(PretrainedConfig): + model_type = "resnet" +def __init__( + self, + block_type="bottleneck", + layers: List[int] = [3, 4, 6, 3], + num_classes: int = 1000, + input_channels: int = 3, + cardinality: int = 1, + base_width: int = 64, + stem_width: int = 64, + stem_type: str = "", + avg_down: bool = False, + **kwargs, +): + if block_type not in ["basic", "bottleneck"]: + raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_15.txt b/chunked/nltk_chunking/_custom_models/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e6d9eccd75c215c6ec4986aa94b5eefe981e578 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_15.txt @@ -0,0 +1,2 @@ +if stem_type not in ["", "deep", "deep-tiered"]: + raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_16.txt b/chunked/nltk_chunking/_custom_models/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3a802d4f30fc25ef13c92b9c561428b4670d33e --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_16.txt @@ -0,0 +1,15 @@ +self.block_type = block_type + self.layers = layers + self.num_classes = num_classes + self.input_channels = input_channels + self.cardinality = cardinality + self.base_width = base_width + self.stem_width = stem_width + self.stem_type = stem_type + self.avg_down = avg_down + super().__init__(**kwargs) + +The three important things to remember when writing your own configuration are the following: +- you have to inherit from PretrainedConfig, +- the __init__ of your PretrainedConfig must accept any kwargs, +- those kwargs need to be passed to the superclass __init__. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_17.txt b/chunked/nltk_chunking/_custom_models/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c34461458d50cc5f578b018784aa73d0cf66c3 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_17.txt @@ -0,0 +1,2 @@ +The inheritance is to make sure you get all the functionality from the 🤗 Transformers library, while the two other +constraints come from the fact that a PretrainedConfig has more fields than the ones you are setting. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_18.txt b/chunked/nltk_chunking/_custom_models/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..762e651a2eb2bf04ccc266ad12ff269a455e969d --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_18.txt @@ -0,0 +1,3 @@ +When reloading a +config with the from_pretrained method, those fields need to be accepted by your config and then sent to the +superclass.
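To make the three rules above concrete, here is a minimal, hedged sketch, assuming the ResnetConfig class defined above is in scope (the specific argument values are illustrative, not from the original docs): a standard argument such as num_labels is swallowed by **kwargs and handled by the PretrainedConfig superclass, while the validity check rejects unknown block types:

config = ResnetConfig(block_type="basic", num_labels=10)  # num_labels is passed through **kwargs to PretrainedConfig
print(config.block_type, config.num_labels)  # basic 10
try:
    ResnetConfig(block_type="wide")
except ValueError as err:
    print(err)  # `block_type` must be 'basic' or 'bottleneck', got wide.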
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_19.txt b/chunked/nltk_chunking/_custom_models/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfc2c4fc520ab7a7c3b876fea289872fa17974c6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_19.txt @@ -0,0 +1,2 @@ +Defining a model_type for your configuration (here model_type="resnet") is not mandatory, unless you want to +register your model with the auto classes (see last section). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_2.txt b/chunked/nltk_chunking/_custom_models/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..4662920a6b5cbecfd54d6965a1a882fe34f195f7 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_2.txt @@ -0,0 +1 @@ +If you are writing a brand new model, it might be easier to start from scratch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_20.txt b/chunked/nltk_chunking/_custom_models/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba8d219f486cefd8b46c02024315408350d2ce45 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_20.txt @@ -0,0 +1,2 @@ +With this done, you can easily create and save your configuration like you would do with any other model config of the +library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_21.txt b/chunked/nltk_chunking/_custom_models/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b2d9d873c4205a3c9e3de8822254c520b2869b4 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_21.txt @@ -0,0 +1,5 @@ +Here is how we can create a resnet50d config and save it: +py +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d_config.save_pretrained("custom-resnet") +This will save a file named config.json inside the folder custom-resnet. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_22.txt b/chunked/nltk_chunking/_custom_models/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9883a3dd1260f9dd86415e0f4b99a6f6296aeab --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_22.txt @@ -0,0 +1,6 @@ +You can then reload your config with the +from_pretrained method: +py +resnet50d_config = ResnetConfig.from_pretrained("custom-resnet") +You can also use any other method of the [PretrainedConfig] class, like [~PretrainedConfig.push_to_hub] to +directly upload your config to the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_23.txt b/chunked/nltk_chunking/_custom_models/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..626338767702f12683a1e8a5b9dcfde220e95f60 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_23.txt @@ -0,0 +1,2 @@ +Writing a custom model +Now that we have our ResNet configuration, we can go on writing the model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_24.txt b/chunked/nltk_chunking/_custom_models/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..b07f93b4f634fecf07da6fb4ab8334e23ba854c7 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_24.txt @@ -0,0 +1,3 @@ +We will actually write two: one that +extracts the hidden features from a batch of images (like [BertModel]) and one that is suitable for image +classification (like [BertForSequenceClassification]). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_25.txt b/chunked/nltk_chunking/_custom_models/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..457d18aab49143ae21b9a24844cc34083d53b3d6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_25.txt @@ -0,0 +1 @@ +As we mentioned before, we'll only write a loose wrapper of the model to keep it simple for this example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_26.txt b/chunked/nltk_chunking/_custom_models/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..51b823ce764c4df1d67b339cfb102d00fe4c8f16 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_26.txt @@ -0,0 +1,2 @@ +The only +thing we need to do before writing this class is a map between the block types and actual block classes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_27.txt b/chunked/nltk_chunking/_custom_models/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab10b738ac7a2e5741d666ba6ffe333843c782d --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_27.txt @@ -0,0 +1,56 @@ +Then the +model is defined from the configuration by passing everything to the ResNet class: + +from transformers import PreTrainedModel +from timm.models.resnet import BasicBlock, Bottleneck, ResNet +from .configuration_resnet import ResnetConfig +BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck} +class ResnetModel(PreTrainedModel): + config_class = ResnetConfig +def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + +def forward(self, tensor): + return self.model.forward_features(tensor) + +For the model that will classify images, we just change the forward method: + +import torch +class ResnetModelForImageClassification(PreTrainedModel): + config_class = ResnetConfig +def __init__(self, config): + super().__init__(config) + block_layer = BLOCK_MAPPING[config.block_type] + self.model = ResNet( + block_layer, + config.layers, + num_classes=config.num_classes, + in_chans=config.input_channels, + cardinality=config.cardinality, + base_width=config.base_width, + stem_width=config.stem_width, + stem_type=config.stem_type, + avg_down=config.avg_down, + ) + +def forward(self, tensor, labels=None): + logits = self.model(tensor) + if labels is not None: + loss = torch.nn.functional.cross_entropy(logits, labels) + return {"loss": loss, "logits": logits} + return {"logits": logits} + +In both cases, notice how we inherit from PreTrainedModel and call the superclass initialization with the config +(a bit like when you write
a regular torch.nn.Module). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_28.txt b/chunked/nltk_chunking/_custom_models/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e0909605cf7389d06b581131205d840dae88330 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_28.txt @@ -0,0 +1,2 @@ +The line that sets the config_class is not mandatory, unless +you want to register your model with the auto classes (see last section). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_29.txt b/chunked/nltk_chunking/_custom_models/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..b20cdd3718fd60dfa9d21f5094dc1984b3716887 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_29.txt @@ -0,0 +1 @@ +If your model is very similar to a model inside the library, you can re-use the same configuration as this model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_3.txt b/chunked/nltk_chunking/_custom_models/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..65dbb2ecce9c3847bc9885b9af75af32d080166d --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_3.txt @@ -0,0 +1,4 @@ +In this tutorial, we will show you +how to write a custom model and its configuration so it can be used inside Transformers, and how you can share it +with the community (with the code it relies on) so that anyone can use it, even if it's not present in the 🤗 +Transformers library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_30.txt b/chunked/nltk_chunking/_custom_models/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7833b881b0145203b829defacfef1d06b0c5d4d8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_30.txt @@ -0,0 +1,3 @@ +You can have your model return anything you want, but returning a dictionary like we did for +ResnetModelForImageClassification, with the loss included when labels are passed, will make your model directly +usable inside the [Trainer] class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_31.txt b/chunked/nltk_chunking/_custom_models/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2bf94fcf9966f93a60b6b7b24f2b2fb74730da8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_31.txt @@ -0,0 +1,2 @@ +Using another output format is fine as long as you are planning on using your own +training loop or another library for training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_32.txt b/chunked/nltk_chunking/_custom_models/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb301901bcff31c952758b5deecf0367c25cc283 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_32.txt @@ -0,0 +1,5 @@ +Now that we have our model class, let's create one: +py +resnet50d = ResnetModelForImageClassification(resnet50d_config) +Again, you can use any of the methods of [PreTrainedModel], like [~PreTrainedModel.save_pretrained] or +[~PreTrainedModel.push_to_hub]. 
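As a sanity check of the dictionary output described above, here is a minimal sketch of a forward pass on dummy data; it assumes the ResnetConfig and ResnetModelForImageClassification classes defined earlier are in scope and that torch and timm are installed (the tiny layer counts are an arbitrary choice for a fast test, not part of the original docs):

import torch
config = ResnetConfig(block_type="basic", layers=[1, 1, 1, 1], num_classes=10)  # small config for a quick test
model = ResnetModelForImageClassification(config)
pixel_values = torch.randn(2, 3, 224, 224)  # dummy batch of two images
labels = torch.tensor([0, 3])
outputs = model(pixel_values, labels=labels)
print(outputs["loss"], outputs["logits"].shape)  # scalar loss and torch.Size([2, 10]), the format the Trainer expects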
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_33.txt b/chunked/nltk_chunking/_custom_models/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..963035269bdab3ec7c8cfd0939d6fe74a4b23d98 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_33.txt @@ -0,0 +1,2 @@ +We will use the second in the next section, and see how to push the model weights +with the code of our model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_34.txt b/chunked/nltk_chunking/_custom_models/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cd21e5dcee9afff9ad1615db02a62e9a58cb7f2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_34.txt @@ -0,0 +1 @@ +But first, let's load some pretrained weights inside our model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_35.txt b/chunked/nltk_chunking/_custom_models/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdbfc219357dc4715f57155066f040c926a0b898 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_35.txt @@ -0,0 +1 @@ +In your own use case, you will probably be training your custom model on your own data. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_36.txt b/chunked/nltk_chunking/_custom_models/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cb2eb2ed8b421904e3d796705bc71a6dc030a0e --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_36.txt @@ -0,0 +1,2 @@ +To go fast for this tutorial, +we will use the pretrained version of the resnet50d. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_37.txt b/chunked/nltk_chunking/_custom_models/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..186c076255dcb122192626a852dd44e4c8db8402 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_37.txt @@ -0,0 +1,9 @@ +Since our model is just a wrapper around it, it's going to be +easy to transfer those weights: + +import timm +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) + +Now let's see how to make sure that when we do [~PreTrainedModel.save_pretrained] or [~PreTrainedModel.push_to_hub], the +code of the model is saved. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_38.txt b/chunked/nltk_chunking/_custom_models/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..14fceb13fe41f7b9caed0bf6915f1bc019342057 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_38.txt @@ -0,0 +1,3 @@ +Registering a model with custom code to the auto classes +If you are writing a library that extends 🤗 Transformers, you may want to extend the auto classes to include your own +model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_39.txt b/chunked/nltk_chunking/_custom_models/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fc6f2baf40b37d4f6265a680fbf7ffdb962fe39 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_39.txt @@ -0,0 +1,2 @@ +This is different from pushing the code to the Hub in the sense that users will need to import your library to +get the custom models (contrarily to automatically downloading the model code from the Hub). 
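Before moving on to the Hub, a quick hedged sketch of a purely local round trip (the folder name is an arbitrary assumption for this sketch): save_pretrained writes config.json plus the weight file, and the concrete class can reload them without any extra registration because it already sets config_class:

resnet50d.save_pretrained("custom-resnet50d")  # writes config.json and the model weights locally
reloaded = ResnetModelForImageClassification.from_pretrained("custom-resnet50d")
print(type(reloaded).__name__)  # ResnetModelForImageClassification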
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_4.txt b/chunked/nltk_chunking/_custom_models/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..e90c2cd39e7b14ccd7c8a8869dce65314248fd46 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_4.txt @@ -0,0 +1,2 @@ +We'll see how to build upon transformers and extend the framework with your hooks and +custom code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_40.txt b/chunked/nltk_chunking/_custom_models/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..0696d7f2f09103780e4178ae89305c5125b1bf56 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_40.txt @@ -0,0 +1,11 @@ +As long as your config has a model_type attribute that is different from existing model types, and that your model +classes have the right config_class attributes, you can just add them to the auto classes like this: + +from transformers import AutoConfig, AutoModel, AutoModelForImageClassification +AutoConfig.register("resnet", ResnetConfig) +AutoModel.register(ResnetConfig, ResnetModel) +AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification) + +Note that the first argument used when registering your custom config to [AutoConfig] needs to match the model_type +of your custom config, and the first argument used when registering your custom models to any auto model class needs +to match the config_class of those models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_41.txt b/chunked/nltk_chunking/_custom_models/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba57f250eda4571e97662cb99d5f94309d244991 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_41.txt @@ -0,0 +1,3 @@ +Sending the code to the Hub + +This API is experimental and may have some slight breaking changes in the next releases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_42.txt b/chunked/nltk_chunking/_custom_models/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce40a6b5e23218b8deab6b8bdeff852f2bd72fd2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_42.txt @@ -0,0 +1 @@ +First, make sure your model is fully defined in a .py file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_43.txt b/chunked/nltk_chunking/_custom_models/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..39de8ca09a502833270896b7fca89d909b093f0e --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_43.txt @@ -0,0 +1,2 @@ +It can rely on relative imports to some other files as +long as all the files are in the same directory (we don't support submodules for this feature yet). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_44.txt b/chunked/nltk_chunking/_custom_models/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..92fa01477bdd674b531610c5788b2661624bcad9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_44.txt @@ -0,0 +1,3 @@ +For our example, +we'll define a modeling_resnet.py file and a configuration_resnet.py file in a folder of the current working +directory named resnet_model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_45.txt b/chunked/nltk_chunking/_custom_models/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..007e57cfacc8ab822f13d656251b1221d9b83c6d --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_45.txt @@ -0,0 +1,2 @@ +The configuration file contains the code for ResnetConfig and the modeling file +contains the code of ResnetModel and ResnetModelForImageClassification. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_46.txt b/chunked/nltk_chunking/_custom_models/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..945c9b46d684f08ec84cb316e1dc0061e361f794 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_46.txt @@ -0,0 +1 @@ +. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_47.txt b/chunked/nltk_chunking/_custom_models/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..34ee01ffe2bb80de1cac90f577c974f381c70af8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_47.txt @@ -0,0 +1,5 @@ +└── resnet_model + ├── __init__.py + ├── configuration_resnet.py + └── modeling_resnet.py +The __init__.py can be empty, it's just there so that Python detects resnet_model can be use as a module. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_48.txt b/chunked/nltk_chunking/_custom_models/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..57dcb6e18bf66b576589b7b1eeb612d21eb8ce54 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_48.txt @@ -0,0 +1,2 @@ +If copying a modeling files from the library, you will need to replace all the relative imports at the top of the file +to import from the transformers package. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_49.txt b/chunked/nltk_chunking/_custom_models/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b6b528398412aa207368e51d601e84a2115675f --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_49.txt @@ -0,0 +1 @@ +Note that you can re-use (or subclass) an existing configuration/model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_5.txt b/chunked/nltk_chunking/_custom_models/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..feb64eb4adaae75edcbbc0303b9544e3f2ed12af --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_5.txt @@ -0,0 +1,2 @@ +We will illustrate all of this on a ResNet model, by wrapping the ResNet class of the +timm library into a [PreTrainedModel]. 
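As a small hedged illustration of the note above about replacing relative imports when copying a modeling file (the exact import lines are an assumption about what a copied file might contain, not taken from a specific file):

# inside resnet_model/modeling_resnet.py, a library-internal relative import such as
#     from ...modeling_utils import PreTrainedModel
# is rewritten as an absolute import from the installed package:
from transformers.modeling_utils import PreTrainedModel
from transformers.configuration_utils import PretrainedConfig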
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_50.txt b/chunked/nltk_chunking/_custom_models/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..c68b29a2447b14ec6938fbbec335d5d2df3a48b2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_50.txt @@ -0,0 +1,13 @@ +To share your model with the community, follow those steps: first import the ResNet model and config from the newly +created files: +py +from resnet_model.configuration_resnet import ResnetConfig +from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification +Then you have to tell the library you want to copy the code files of those objects when using the save_pretrained +method and properly register them with a given Auto class (especially for models), just run: +py +ResnetConfig.register_for_auto_class() +ResnetModel.register_for_auto_class("AutoModel") +ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification") +Note that there is no need to specify an auto class for the configuration (there is only one auto class for them, +[AutoConfig]) but it's different for models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_51.txt b/chunked/nltk_chunking/_custom_models/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0eacce0e500f41d1dfaf752c9672f21ea287f03 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_51.txt @@ -0,0 +1,2 @@ +Your custom model could be suitable for many different tasks, so you +have to specify which one of the auto classes is the correct one for your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_52.txt b/chunked/nltk_chunking/_custom_models/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..36e30b2bbbd20765ea2ca934bd5ebe3917807263 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_52.txt @@ -0,0 +1 @@ +Use register_for_auto_class() if you want the code files to be copied. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_53.txt b/chunked/nltk_chunking/_custom_models/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f2badedda0c460ad2e193dbb49f537cec58ce6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_53.txt @@ -0,0 +1,2 @@ +If you instead prefer to use code on the Hub from another repo, +you don't need to call it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_54.txt b/chunked/nltk_chunking/_custom_models/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..14f703570299acd4ac390bff49164e43403a62d6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_54.txt @@ -0,0 +1,17 @@ +In cases where there's more than one auto class, you can modify the config.json directly using the +following structure: +json +"auto_map": { + "AutoConfig": "--", + "AutoModel": "--", + "AutoModelFor": "--", +}, + +Next, let's create the config and models as we did before: + +resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True) +resnet50d = ResnetModelForImageClassification(resnet50d_config) +pretrained_model = timm.create_model("resnet50d", pretrained=True) +resnet50d.model.load_state_dict(pretrained_model.state_dict()) + +Now to send the model to the Hub, make sure you are logged in. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_55.txt b/chunked/nltk_chunking/_custom_models/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f82fc911849b1b3aa24a68d2a656fe24293d8bc --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_55.txt @@ -0,0 +1,13 @@ +Either run in your terminal: + +huggingface-cli login +or from a notebook: + +from huggingface_hub import notebook_login +notebook_login() + +You can then push to your own namespace (or an organization you are a member of) like this: +py +resnet50d.push_to_hub("custom-resnet50d") +On top of the modeling weights and the configuration in json format, this also copied the modeling and +configuration .py files in the folder custom-resnet50d and uploaded the result to the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_56.txt b/chunked/nltk_chunking/_custom_models/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bf7563c2086bad8e6beebd33692326e99da78e1 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_56.txt @@ -0,0 +1,2 @@ +You can check the result +in this model repo. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_57.txt b/chunked/nltk_chunking/_custom_models/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..576aef6eb6dced784bb27480e6ea5fc800e97554 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_57.txt @@ -0,0 +1 @@ +See the sharing tutorial for more information on the push to Hub method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_58.txt b/chunked/nltk_chunking/_custom_models/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..c865df126bdda61cf553e1fdc6a181f0c5be83b0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_58.txt @@ -0,0 +1,3 @@ +Using a model with custom code +You can use any configuration, model or tokenizer with custom code files in its repository with the auto-classes and +the from_pretrained method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_59.txt b/chunked/nltk_chunking/_custom_models/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0da27216e9cd7b3debe9fd461e48dad75967ec8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_59.txt @@ -0,0 +1,2 @@ +All files and code uploaded to the Hub are scanned for malware (refer to the Hub security documentation for more information), but you should still +review the model code and author to avoid executing malicious code on your machine. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_6.txt b/chunked/nltk_chunking/_custom_models/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e0e734901ee316d5ea161f992f9123c28dd45a2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_6.txt @@ -0,0 +1,2 @@ +Writing a custom configuration +Before we dive into the model, let's first write its configuration. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_60.txt b/chunked/nltk_chunking/_custom_models/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceee2c3cf869986ad1edb7618526fe8c722478bd --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_60.txt @@ -0,0 +1,8 @@ +Set trust_remote_code=True to use +a model with custom code: + +from transformers import AutoModelForImageClassification +model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True) + +It is also strongly encouraged to pass a commit hash as a revision to make sure the author of the models did not +update the code with some malicious new lines (unless you fully trust the authors of the models). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_61.txt b/chunked/nltk_chunking/_custom_models/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd2ed8d4ff320606a803462caba3b1faa579c114 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_61.txt @@ -0,0 +1,7 @@ +py +commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292" +model = AutoModelForImageClassification.from_pretrained( + "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash +) +Note that when browsing the commit history of the model repo on the Hub, there is a button to easily copy the commit +hash of any commit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_7.txt b/chunked/nltk_chunking/_custom_models/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4e7bb83d6af3e842a33d3e3ecb3415a87e92e99 --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_7.txt @@ -0,0 +1,2 @@ +The configuration of a model is an object that +will contain all the necessary information to build the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_8.txt b/chunked/nltk_chunking/_custom_models/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..050d18ecf1159ad996a128bb4438ef03721b1c2b --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_8.txt @@ -0,0 +1,2 @@ +As we will see in the next section, the model can only +take a config to be initialized, so we really need that object to be as complete as possible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_models/chunk_9.txt b/chunked/nltk_chunking/_custom_models/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ba5eac16671bd11d4aa38a93c3aa6ae54c205eb --- /dev/null +++ b/chunked/nltk_chunking/_custom_models/chunk_9.txt @@ -0,0 +1,3 @@ +Models in the transformers library itself generally follow the convention that they accept a config object +in their __init__ method, and then pass the whole config to sub-layers in the model, rather than breaking the +config object into multiple arguments that are all passed individually to sub-layers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_0.txt b/chunked/nltk_chunking/_custom_tools/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7ec3656cc89330db9d3970c2d01c5a2a5225d67 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_0.txt @@ -0,0 +1,5 @@ + +Custom Tools and Prompts + +If you are not aware of what tools and agents are in the context of transformers, we recommend you read the +Transformers Agents page first. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_1.txt b/chunked/nltk_chunking/_custom_tools/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2b0c05ec800c4d7fbad186ade3a767d81a87f39 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_1.txt @@ -0,0 +1 @@ +Transformers Agents is an experimental API that is subject to change at any time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_10.txt b/chunked/nltk_chunking/_custom_tools/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..73ca7122d05d06606614d3a6980c1f7981f5c6cb --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_10.txt @@ -0,0 +1 @@ +The prompt is structured broadly into four parts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_100.txt b/chunked/nltk_chunking/_custom_tools/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c3d6db429c8df1718ba6b37d3895a7008625116 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_100.txt @@ -0,0 +1 @@ +We can help the agent here by changing the tool name and description of image_transformer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_101.txt b/chunked/nltk_chunking/_custom_tools/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81bf0689fa0d7a1e321d31a621c240064fac3bd --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_101.txt @@ -0,0 +1,8 @@ +Let's instead call it modifier +to disassociate it a bit from "image" and "prompt": +py +agent.toolbox["modifier"] = agent.toolbox.pop("image_transformer") +agent.toolbox["modifier"].description = agent.toolbox["modifier"].description.replace( + "transforms an image according to a prompt", "modifies an image" +) +Now "modify" is a strong cue to use the new image processor which should help with the above prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_102.txt b/chunked/nltk_chunking/_custom_tools/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccc9f146e5d69850e671753cca47b091a62022d0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_102.txt @@ -0,0 +1 @@ +Let's run it again. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_103.txt b/chunked/nltk_chunking/_custom_tools/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..9dc056c2bf3abe29494a280fd63cf5780c6b39f9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_103.txt @@ -0,0 +1,6 @@ +py +agent.run("Make an image of a house and a car", return_code=True) +Now we're getting: +``text +==Explanation from the agent== +I will use the following tools:image_generatorto generate an image of a house, thenimage_generator` to generate an image of a car. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_104.txt b/chunked/nltk_chunking/_custom_tools/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..784f408dfebfdecffeb85e8e056641769dee14e2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_104.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") + +which is definitely closer to what we had in mind! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_105.txt b/chunked/nltk_chunking/_custom_tools/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..f131a2088d774566a2db262d70baf6273a0eb3eb --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_105.txt @@ -0,0 +1 @@ +However, we want to have both the house and car in the same image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_106.txt b/chunked/nltk_chunking/_custom_tools/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8062b9e755fe6f114b483031b2360545f59b75b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_106.txt @@ -0,0 +1,6 @@ +Steering the task more toward single image generation should help: +py +agent.run("Create image: 'A house and car'", return_code=True) +``text +==Explanation from the agent== +I will use the following tool:image_generator` to generate an image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_107.txt b/chunked/nltk_chunking/_custom_tools/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..63dce6486ef2371291a50a884e478aa462fb0ee2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_107.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +image = image_generator(prompt="A house and car") + +Agents are still brittle for many use cases, especially when it comes to +slightly more complex use cases like generating an image of multiple objects. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_108.txt b/chunked/nltk_chunking/_custom_tools/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..23d48d3cfc4db4d24d831421abe9ac3b4cd05112 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_108.txt @@ -0,0 +1,2 @@ +Both the agent itself and the underlying prompt will be further improved in the coming +months making sure that agents become more robust to a variety of user inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_109.txt b/chunked/nltk_chunking/_custom_tools/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f9399c4a8007c74ac5d1caec3ffc68dbe9f27aa --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_109.txt @@ -0,0 +1,3 @@ +Customizing the whole prompt +To give the user maximum flexibility, the whole prompt template as explained in above +can be overwritten by the user. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_11.txt b/chunked/nltk_chunking/_custom_tools/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..22c606919ad2b13d783d024f283b0b6dd34e1b4a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_11.txt @@ -0,0 +1 @@ +Introduction: how the agent should behave, explanation of the concept of tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_110.txt b/chunked/nltk_chunking/_custom_tools/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0cf33a8595f48685734b3b9c83f081ea19007e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_110.txt @@ -0,0 +1,2 @@ +In this case make sure that your custom prompt includes an introduction section, +a tool section, an example section, and an unfinished example section. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_111.txt b/chunked/nltk_chunking/_custom_tools/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa0a8c9e4c99867cfe6f57c59992f70727cb0985 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_111.txt @@ -0,0 +1,8 @@ +If you want to overwrite the run prompt template, +you can do as follows: + +template = """ [] """ +agent = HfAgent(your_endpoint, run_prompt_template=template) + +Please make sure to have the <> string and the <> defined somewhere in the template so that the agent can be aware +of the tools, it has available to it as well as correctly insert the user's prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_112.txt b/chunked/nltk_chunking/_custom_tools/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b015696944745a5be2c794c91083c2d2bbba471 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_112.txt @@ -0,0 +1 @@ +Similarly, one can overwrite the chat prompt template. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_113.txt b/chunked/nltk_chunking/_custom_tools/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7b8be6b2447ef1ce2d45bbb14911f4ebced6e31 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_113.txt @@ -0,0 +1,6 @@ +Note that the chat mode always uses the following format for the exchanges: +```text +Human: <> +Assistant: + +Therefore it is important that the examples of the custom chat prompt template also make use of this format. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_114.txt b/chunked/nltk_chunking/_custom_tools/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f7e47039b4da9bfe59dd1f8ccafb8c157ffaefb --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_114.txt @@ -0,0 +1 @@ +You can overwrite the chat template at instantiation as follows. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_115.txt b/chunked/nltk_chunking/_custom_tools/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac95de28060b781134710ebb9908b98aab8b153e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_115.txt @@ -0,0 +1,6 @@ +thon +template = """ [] """ +agent = HfAgent(url_endpoint=your_endpoint, chat_prompt_template=template) + +Please make sure to have the <> string defined somewhere in the template so that the agent can be aware +of the tools, it has available to it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_116.txt b/chunked/nltk_chunking/_custom_tools/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd93e541d0b272d56fd3b61c12017194a3537d85 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_116.txt @@ -0,0 +1 @@ +In both cases, you can pass a repo ID instead of the prompt template if you would like to use a template hosted by someone in the community. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_117.txt b/chunked/nltk_chunking/_custom_tools/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c00ebc7bc8a1febe433553e10f1cbfad0e625b8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_117.txt @@ -0,0 +1 @@ +The default prompts live in this repo as an example. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_118.txt b/chunked/nltk_chunking/_custom_tools/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce78dea7b44494ab4bb1f183544992bd41dfa059 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_118.txt @@ -0,0 +1,10 @@ +To upload your custom prompt on a repo on the Hub and share it with the community just make sure: +- to use a dataset repository +- to put the prompt template for the run command in a file named run_prompt_template.txt +- to put the prompt template for the chat command in a file named chat_prompt_template.txt +Using custom tools +In this section, we'll be leveraging two existing custom tools that are specific to image generation: + +We replace huggingface-tools/image-transformation, + with diffusers/controlnet-canny-tool + to allow for more image modifications. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_119.txt b/chunked/nltk_chunking/_custom_tools/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc7a63fc0c833a94cf4553d5feeaa9335e9959c3 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_119.txt @@ -0,0 +1,2 @@ +We add a new tool for image upscaling to the default toolbox: + diffusers/latent-upscaler-tool replace the existing image-transformation tool. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_12.txt b/chunked/nltk_chunking/_custom_tools/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e9680598724720368a653e8aa2346895093144e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_12.txt @@ -0,0 +1 @@ +Description of all the tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_120.txt b/chunked/nltk_chunking/_custom_tools/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..b8eff38f88c1c01c2c4740c577f0f23bbd6d6f27 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_120.txt @@ -0,0 +1,8 @@ +We'll start by loading the custom tools with the convenient [load_tool] function: + +from transformers import load_tool +controlnet_transformer = load_tool("diffusers/controlnet-canny-tool") +upscaler = load_tool("diffusers/latent-upscaler-tool") + +Upon adding custom tools to an agent, the tools' descriptions and names are automatically +included in the agents' prompts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_121.txt b/chunked/nltk_chunking/_custom_tools/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..f74b02672c2baab94870a4f1d2cc1d23938ea16d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_121.txt @@ -0,0 +1,2 @@ +Thus, it is imperative that custom tools have +a well-written description and name in order for the agent to understand how to use them. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_122.txt b/chunked/nltk_chunking/_custom_tools/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e49db1b66d4f0aa698222cd6d2cc542f8d4a2cd --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_122.txt @@ -0,0 +1,7 @@ +Let's take a look at the description and name of controlnet_transformer: +py +print(f"Description: '{controlnet_transformer.description}'") +print(f"Name: '{controlnet_transformer.name}'") +gives +text +Description: 'This is a tool that transforms an image with ControlNet according to a prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_123.txt b/chunked/nltk_chunking/_custom_tools/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..0040646ecccd126cd51780fbf5b3404aa4793b2c --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_123.txt @@ -0,0 +1 @@ +It takes two inputs: `image`, which should be the image to transform, and `prompt`, which should be the prompt to use to change it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_124.txt b/chunked/nltk_chunking/_custom_tools/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..5158cc25255d432898e514acdf2397772be5c62e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_124.txt @@ -0,0 +1 @@ +It returns the modified image.' \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_125.txt b/chunked/nltk_chunking/_custom_tools/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66eaab8a7003f66847aa83a00016972a6c7e997 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_125.txt @@ -0,0 +1,2 @@ +Name: 'image_transformer' +The name and description are accurate and fit the style of the curated set of tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_126.txt b/chunked/nltk_chunking/_custom_tools/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..11620ca42b33c06cbafeab678120d4e9e3f21c23 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_126.txt @@ -0,0 +1,9 @@ +Next, let's instantiate an agent with controlnet_transformer and upscaler: +py +tools = [controlnet_transformer, upscaler] +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=tools) +This command should give you the following info: +text +image_transformer has been replaced by as provided in `additional_tools` +The set of curated tools already has an image_transformer tool which is hereby replaced with our custom tool. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_127.txt b/chunked/nltk_chunking/_custom_tools/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..32c1635554b42a557fdd2d2b38875974a6be286c --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_127.txt @@ -0,0 +1,2 @@ +Overwriting existing tools can be beneficial if we want to use a custom tool exactly for the same task as an existing tool +because the agent is well-versed in using the specific task. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_128.txt b/chunked/nltk_chunking/_custom_tools/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff05d761cd16912fed74122f055014d12993e428 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_128.txt @@ -0,0 +1,3 @@ +Beware that the custom tool should follow the exact same API +as the overwritten tool in this case, or you should adapt the prompt template to make sure all examples using that +tool are updated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_129.txt b/chunked/nltk_chunking/_custom_tools/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f5c3eb464ff6abca18a486db6f166b7bf6919a3 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_129.txt @@ -0,0 +1 @@ +The upscaler tool was given the name image_upscaler which is not yet present in the default toolbox and is therefore simply added to the list of tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_13.txt b/chunked/nltk_chunking/_custom_tools/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a32bbdff054ebe294d04cb3860556c04a27806f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_13.txt @@ -0,0 +1 @@ +This is defined by a <> token that is dynamically replaced at runtime with the tools defined/chosen by the user. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_130.txt b/chunked/nltk_chunking/_custom_tools/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..df99f0e4433adf993a2c811d47d32d965f25cfd4 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_130.txt @@ -0,0 +1,20 @@ +You can always have a look at the toolbox that is currently available to the agent via the agent.toolbox attribute: +py +print("\n".join([f"- {a}" for a in agent.toolbox.keys()])) +text +- document_qa +- image_captioner +- image_qa +- image_segmenter +- transcriber +- summarizer +- text_classifier +- text_qa +- text_reader +- translator +- image_transformer +- text_downloader +- image_generator +- video_generator +- image_upscaler +Note how image_upscaler is now part of the agents' toolbox. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_131.txt b/chunked/nltk_chunking/_custom_tools/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..c56ff38ad75bcde8185216c62ef635fdd51409f2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_131.txt @@ -0,0 +1 @@ +Let's now try out the new tools! \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_132.txt b/chunked/nltk_chunking/_custom_tools/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a92c1c3540f4662c2f8a18c2df2a940c49298e6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_132.txt @@ -0,0 +1 @@ +We will re-use the image we generated in Transformers Agents Quickstart. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_133.txt b/chunked/nltk_chunking/_custom_tools/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..7392bddbcfac83a42d238e771f4b496e9a355c68 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_133.txt @@ -0,0 +1,12 @@ +from diffusers.utils import load_image +image = load_image( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" +) + + +Let's transform the image into a beautiful winter landscape: +py +image = agent.run("Transform the image: 'A frozen lake and snowy forest'", image=image) +``text +==Explanation from the agent== +I will use the following tool:image_transformer` to transform the image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_134.txt b/chunked/nltk_chunking/_custom_tools/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..a999c6f26af1386d84def9bce00f2dd8a0911f8b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_134.txt @@ -0,0 +1,5 @@ +==Code generated by the agent== +image = image_transformer(image, prompt="A frozen lake and snowy forest") + + +The new image processing tool is based on ControlNet which can make very strong modifications to the image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_135.txt b/chunked/nltk_chunking/_custom_tools/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..61090f858e5579f2dc73cd3af48d6c078a0d6186 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_135.txt @@ -0,0 +1 @@ +By default the image processing tool returns an image of size 512x512 pixels. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_136.txt b/chunked/nltk_chunking/_custom_tools/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..be9dc045ca1aa2a3ac6bc2651002c242e7ba2d2a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_136.txt @@ -0,0 +1 @@ +Let's see if we can upscale it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_137.txt b/chunked/nltk_chunking/_custom_tools/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8749ddf6a9024530d69fdcc678a0c6bd6671e0b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_137.txt @@ -0,0 +1,5 @@ +py +image = agent.run("Upscale the image", image) +``text +==Explanation from the agent== +I will use the following tool:image_upscaler` to upscale the image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_138.txt b/chunked/nltk_chunking/_custom_tools/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6aee9ed429b1584b1e329f20e439417ffe9ea18 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_138.txt @@ -0,0 +1,6 @@ +==Code generated by the agent== +upscaled_image = image_upscaler(image) + + +The agent automatically mapped our prompt "Upscale the image" to the just added upscaler tool purely based on the description and name of the upscaler tool +and was able to correctly run it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_139.txt b/chunked/nltk_chunking/_custom_tools/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa898eafcc8dc4387cf599fb1163907e5ac9da70 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_139.txt @@ -0,0 +1 @@ +Next, let's have a look at how you can create a new custom tool. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_14.txt b/chunked/nltk_chunking/_custom_tools/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a996fcfc25fba44f143f371a54d86b299d39882 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_14.txt @@ -0,0 +1,3 @@ +A set of examples of tasks and their solution + +Current example, and request for solution. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_140.txt b/chunked/nltk_chunking/_custom_tools/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e9efb749fc698fb310035fc3760bd3fb8196cea --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_140.txt @@ -0,0 +1,2 @@ +Adding new tools +In this section, we show how to create a new tool that can be added to the agent. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_141.txt b/chunked/nltk_chunking/_custom_tools/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c11212837eeba3b661d65eda101e48df09c883f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_141.txt @@ -0,0 +1,2 @@ +Creating a new tool +We'll first start by creating a tool. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_142.txt b/chunked/nltk_chunking/_custom_tools/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..227229d3990650c6f70ef0bdf49e5ecd6f9a77a9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_142.txt @@ -0,0 +1,2 @@ +We'll add the not-so-useful yet fun task of fetching the model on the Hugging Face +Hub with the most downloads for a given task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_143.txt b/chunked/nltk_chunking/_custom_tools/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..db453e4704233e8665f5a3b927e7ce15c0239497 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_143.txt @@ -0,0 +1,8 @@ +We can do that with the following code: +thon +from huggingface_hub import list_models +task = "text-classification" +model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) +print(model.id) + +For the task text-classification, this returns 'facebook/bart-large-mnli', for translation it returns 'google-t5/t5-base. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_144.txt b/chunked/nltk_chunking/_custom_tools/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..16059592c1a7d31c04ac8d277aac96ac75afc9e1 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_144.txt @@ -0,0 +1 @@ +How do we convert this to a tool that the agent can leverage? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_145.txt b/chunked/nltk_chunking/_custom_tools/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e3219ba88fe28bbeec86600c899bde6bb9f13a9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_145.txt @@ -0,0 +1,2 @@ +All tools depend on the superclass Tool that holds the +main attributes necessary. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_146.txt b/chunked/nltk_chunking/_custom_tools/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f02ad6fd309b047ec2b7e66382bd77a31b4eef5 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_146.txt @@ -0,0 +1,8 @@ +We'll create a class that inherits from it: +thon +from transformers import Tool +class HFModelDownloadsTool(Tool): + pass + +This class has a few needs: +- An attribute name, which corresponds to the name of the tool itself. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_147.txt b/chunked/nltk_chunking/_custom_tools/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbc5e02112904ae8deba41131eac932a6fcbe490 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_147.txt @@ -0,0 +1,2 @@ +To be in tune with other tools which have a + performative name, we'll name it model_download_counter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_148.txt b/chunked/nltk_chunking/_custom_tools/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..56863af689cec71e5c7e066ef013c46a9e1a9d5d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_148.txt @@ -0,0 +1 @@ +- An attribute description, which will be used to populate the prompt of the agent. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_149.txt b/chunked/nltk_chunking/_custom_tools/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..77476cafe2d93c31a6e279359e2f04a58097fcd6 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_149.txt @@ -0,0 +1 @@ +- inputs and outputs attributes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_15.txt b/chunked/nltk_chunking/_custom_tools/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a84c697b7e776fa3e9e7d0b02094c91623037d69 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_15.txt @@ -0,0 +1,3 @@ +To better understand each part, let's look at a shortened version of how the run prompt can look like: +````text +I will ask you to perform a task, your job is to come up with a series of simple commands in Python that will perform the task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_150.txt b/chunked/nltk_chunking/_custom_tools/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff9deef77bebdcea5ed50e957bc8c4b47c75f300 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_150.txt @@ -0,0 +1,2 @@ +Defining this will help the python interpreter make educated choices about types, + and will allow for a gradio-demo to be spawned when we push our tool to the Hub. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_151.txt b/chunked/nltk_chunking/_custom_tools/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..0042fbca5f8af7a8fec966a29bdca3187fbf0804 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_151.txt @@ -0,0 +1,2 @@ +They're both a list of expected + values, which can be text, image, or audio. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_152.txt b/chunked/nltk_chunking/_custom_tools/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..e97aabcfaaf39ea0a9fee9bed912ffb121747de2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_152.txt @@ -0,0 +1 @@ +- A __call__ method which contains the inference code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_153.txt b/chunked/nltk_chunking/_custom_tools/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f8343bb6ab911af649f486fa2f49dda42b9c5f3 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_153.txt @@ -0,0 +1 @@ +This is the code we've played with above! \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_154.txt b/chunked/nltk_chunking/_custom_tools/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4a7f78c94ba6498b6e005cde26f3b6de4a9231c --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_154.txt @@ -0,0 +1,8 @@ +Here's what our class looks like now: +thon +from transformers import Tool +from huggingface_hub import list_models +class HFModelDownloadsTool(Tool): + name = "model_download_counter" + description = ( + "This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub. " \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_155.txt b/chunked/nltk_chunking/_custom_tools/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..734c8f349706176556b800de4103f47b5f5a3f05 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_155.txt @@ -0,0 +1,2 @@ +"It takes the name of the category (such as text-classification, depth-estimation, etc), and " + "returns the name of the checkpoint." \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_156.txt b/chunked/nltk_chunking/_custom_tools/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..09efbcb7da483dd41db003f830444b5786d28362 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_156.txt @@ -0,0 +1,9 @@ +) +inputs = ["text"] +outputs = ["text"] + +def __call__(self, task: str): + model = next(iter(list_models(filter=task, sort="downloads", direction=-1))) + return model.id + +We now have our tool handy. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_157.txt b/chunked/nltk_chunking/_custom_tools/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..90ea5bfb8e2dd3e543ed03ca57bff9aeab387a38 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_157.txt @@ -0,0 +1 @@ +Save it in a file and import it from your main script. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_158.txt b/chunked/nltk_chunking/_custom_tools/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a2c69f47a96dfa3d478c0352fd37beccb6caca5 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_158.txt @@ -0,0 +1,8 @@ +Let's name this file +model_downloads.py, so the resulting import code looks like this: +thon +from model_downloads import HFModelDownloadsTool +tool = HFModelDownloadsTool() + +In order to let others benefit from it and for simpler initialization, we recommend pushing it to the Hub under your +namespace. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_159.txt b/chunked/nltk_chunking/_custom_tools/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b48caae3bb87b96b13f7c8b70bd809ecdcd5826 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_159.txt @@ -0,0 +1,4 @@ +To do so, just call push_to_hub on the tool variable: +python +tool.push_to_hub("hf-model-downloads") +You now have your code on the Hub! \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_16.txt b/chunked/nltk_chunking/_custom_tools/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..815379be26bb8013cb5c14cb0dd1252156a75221 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_16.txt @@ -0,0 +1,2 @@ +[] +You can print intermediate results if it makes sense to do so. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_160.txt b/chunked/nltk_chunking/_custom_tools/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..8503cb9323ff2d347a3a55394eb12ff44c58377c --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_160.txt @@ -0,0 +1 @@ +Let's take a look at the final step, which is to have the agent use it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_161.txt b/chunked/nltk_chunking/_custom_tools/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3be8096fb7871c6429cdef2a89802da1aacb8be --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_161.txt @@ -0,0 +1,12 @@ +Having the agent use the tool +We now have our tool that lives on the Hub which can be instantiated as such (change the user name for your tool): +thon +from transformers import load_tool +tool = load_tool("lysandre/hf-model-downloads") + +In order to use it in the agent, simply pass it in the additional_tools parameter of the agent initialization method: +thon +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) +agent.run( + "Can you read out loud the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?" 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_162.txt b/chunked/nltk_chunking/_custom_tools/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..510b1727c14d6c4951a882be4812f145832682c0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_162.txt @@ -0,0 +1,5 @@ +) +which outputs the following:text +==Code generated by the agent== +model = model_download_counter(task="text-to-video") +print(f"The model with the most downloads is {model}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_163.txt b/chunked/nltk_chunking/_custom_tools/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a2821ceb1f6e0e2d058c0b3bac9fcf7bd225418 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_163.txt @@ -0,0 +1,3 @@ +audio_model = text_reader(model) +==Result== +The model with the most downloads is damo-vilab/text-to-video-ms-1.7b. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_164.txt b/chunked/nltk_chunking/_custom_tools/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..175cb23a2f87bca8af3a9e7f9b394b6ef319bf58 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_164.txt @@ -0,0 +1 @@ +and generates the following audio. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_165.txt b/chunked/nltk_chunking/_custom_tools/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b8808dc03aa37d16dd19245d8567f9e3909f299 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_165.txt @@ -0,0 +1,5 @@ +| Audio | +|------------------------------------------------------------------------------------------------------------------------------------------------------| +| | + +Depending on the LLM, some are quite brittle and require very exact prompts in order to work well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_166.txt b/chunked/nltk_chunking/_custom_tools/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f972cae920a1273dbf33ce09462e011c0a3730f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_166.txt @@ -0,0 +1,2 @@ +Having a well-defined +name and description of the tool is paramount to having it be leveraged by the agent. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_167.txt b/chunked/nltk_chunking/_custom_tools/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a39b1565949dec96a8aebdbb5e21435fb172e30 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_167.txt @@ -0,0 +1,2 @@ +Replacing existing tools +Replacing existing tools can be done simply by assigning a new item to the agent's toolbox. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_168.txt b/chunked/nltk_chunking/_custom_tools/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..180313b925d4ed7b12efa6f0639d358bd043cd66 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_168.txt @@ -0,0 +1,7 @@ +Here's how one would do so: +thon +from transformers import HfAgent, load_tool +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +agent.toolbox["image-transformation"] = load_tool("diffusers/controlnet-canny-tool") + +Beware when replacing tools with others! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_169.txt b/chunked/nltk_chunking/_custom_tools/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc315c041aa3d3ed9375bded35a2fe61b4c6f3bc --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_169.txt @@ -0,0 +1 @@ +This will also adjust the agent's prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_17.txt b/chunked/nltk_chunking/_custom_tools/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c626f25e88c4ae741e84d7ddbefbb84d73f9417 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_17.txt @@ -0,0 +1,2 @@ +Tools: +- document_qa: This is a tool that answers a question about a document (pdf). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_170.txt b/chunked/nltk_chunking/_custom_tools/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a94dd3afea4562f6c2a669c85a750f906e6f67e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_170.txt @@ -0,0 +1,3 @@ +This can be good if you have a better +prompt suited for the task, but it can also result in your tool being selected way more than others or for other +tools to be selected instead of the one you have defined. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_171.txt b/chunked/nltk_chunking/_custom_tools/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..899714223552df2692fd012d1091eb4a0f386f00 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_171.txt @@ -0,0 +1,3 @@ +Leveraging gradio-tools +gradio-tools is a powerful library that allows using Hugging +Face Spaces as tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_172.txt b/chunked/nltk_chunking/_custom_tools/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..6326adfdad8dbe3486358d981bcf7c799348ffdc --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_172.txt @@ -0,0 +1 @@ +It supports many existing Spaces as well as custom Spaces to be designed with it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_173.txt b/chunked/nltk_chunking/_custom_tools/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..da21b39ec93175c18fd3d1f791175c54d1be97ca --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_173.txt @@ -0,0 +1 @@ +We offer support for gradio_tools by using the Tool.from_gradio method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_174.txt b/chunked/nltk_chunking/_custom_tools/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..fb7adc607be28f6ec21a27cd6a35e9ceba30fdb8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_174.txt @@ -0,0 +1,3 @@ +For example, we want to take +advantage of the StableDiffusionPromptGeneratorTool tool offered in the gradio-tools toolkit so as to +improve our prompts and generate better images. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_175.txt b/chunked/nltk_chunking/_custom_tools/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..1abf320ba64651d09dadb41a9d81076815262cf0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_175.txt @@ -0,0 +1,11 @@ +We first import the tool from gradio_tools and instantiate it: +thon +from gradio_tools import StableDiffusionPromptGeneratorTool +gradio_tool = StableDiffusionPromptGeneratorTool() + +We pass that instance to the Tool.from_gradio method: +thon +from transformers import Tool +tool = Tool.from_gradio(gradio_tool) + +Now we can manage it exactly as we would a usual custom tool. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_176.txt b/chunked/nltk_chunking/_custom_tools/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6e1b3af162f32403b88401a506740a35d3ed483 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_176.txt @@ -0,0 +1,6 @@ +We leverage it to improve our prompt +a rabbit wearing a space suit: +thon +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder", additional_tools=[tool]) +agent.run("Generate an image of the prompt after improving it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_177.txt b/chunked/nltk_chunking/_custom_tools/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..421623f3a93f83bc16d0cd4948fc196d706a93b4 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_177.txt @@ -0,0 +1,6 @@ +", prompt="A rabbit wearing a space suit") + +The model adequately leverages the tool: +``text +==Explanation from the agent== +I will use the following tools:StableDiffusionPromptGeneratorto improve the prompt, thenimage_generator` to generate an image according to the improved prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_178.txt b/chunked/nltk_chunking/_custom_tools/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..a59a16e179efeab9cdefce2ea6946335502f8034 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_178.txt @@ -0,0 +1,3 @@ +==Code generated by the agent== +improved_prompt = StableDiffusionPromptGenerator(prompt) +print(f"The improved prompt is {improved_prompt}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_179.txt b/chunked/nltk_chunking/_custom_tools/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0b6ffaa2fac29a109e976859b132fe8f0cd1fb0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_179.txt @@ -0,0 +1,5 @@ +image = image_generator(improved_prompt) + +Before finally generating the image: + +gradio-tools requires textual inputs and outputs, even when working with different modalities. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_18.txt b/chunked/nltk_chunking/_custom_tools/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..df611b11effb7a961e530838f0e47ea4e496e13e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_18.txt @@ -0,0 +1 @@ +It takes an input named document which should be the document containing the information, as well as a question that is the question about the document. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_180.txt b/chunked/nltk_chunking/_custom_tools/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ea3907baf0db4284d4d7ee90173e1c45c05d3dc --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_180.txt @@ -0,0 +1,2 @@ +This implementation +works with image and audio objects. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_181.txt b/chunked/nltk_chunking/_custom_tools/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..986021f7c22df3ea11092631726e6920b07a149a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_181.txt @@ -0,0 +1,2 @@ +The two are currently incompatible, but will rapidly become compatible as we +work to improve the support. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_182.txt b/chunked/nltk_chunking/_custom_tools/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..b79ab4d162f61568f733ea15c6d24a4f11effc7b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_182.txt @@ -0,0 +1,2 @@ +Future compatibility with Langchain +We love Langchain and think it has a very compelling suite of tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_183.txt b/chunked/nltk_chunking/_custom_tools/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..15ea353a8a5e6656aa422ef240806fd736a1117e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_183.txt @@ -0,0 +1,2 @@ +In order to handle these tools, +Langchain requires textual inputs and outputs, even when working with different modalities. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_184.txt b/chunked/nltk_chunking/_custom_tools/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f092e653bc2af4d5b66b3a6d97fb04aadc12d62 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_184.txt @@ -0,0 +1 @@ +This is often the serialized version (i.e., saved to disk) of the objects. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_185.txt b/chunked/nltk_chunking/_custom_tools/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..445818f2d898f9236c5cf8e6b9ad97ff22d3eaec --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_185.txt @@ -0,0 +1 @@ +This difference means that multi-modality isn't handled between transformers-agents and langchain. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_186.txt b/chunked/nltk_chunking/_custom_tools/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf77006f51001ef93a17fddab60532c79fe9539d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_186.txt @@ -0,0 +1,2 @@ +We aim for this limitation to be resolved in future versions, and welcome any help from avid langchain +users to help us achieve this compatibility. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_187.txt b/chunked/nltk_chunking/_custom_tools/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e57a1b6b331f5c26ebe6105caaa8a48c835127f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_187.txt @@ -0,0 +1 @@ +We would love to have better support. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_188.txt b/chunked/nltk_chunking/_custom_tools/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..68055618d355682fb1c80849150adc37c06e5b33 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_188.txt @@ -0,0 +1,2 @@ +If you would like to help, please +open an issue and share what you have in mind. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_19.txt b/chunked/nltk_chunking/_custom_tools/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..75e95826fe61e961f8def5300ade5b3a2b1d507a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_19.txt @@ -0,0 +1 @@ +It returns a text that contains the answer to the question. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_2.txt b/chunked/nltk_chunking/_custom_tools/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f936cad0340e322b51acf772723df92143e9160 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_2.txt @@ -0,0 +1,2 @@ +Results returned by the agents +can vary as the APIs or underlying models are prone to change. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_20.txt b/chunked/nltk_chunking/_custom_tools/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5a866dbfe788d94ad1fabfa862bfd85afe9d2e2 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_20.txt @@ -0,0 +1 @@ +- image_captioner: This is a tool that generates a description of an image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_21.txt b/chunked/nltk_chunking/_custom_tools/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..be5409038a0a45b3bc8501e9c62d53b02aa1fd60 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_21.txt @@ -0,0 +1 @@ +It takes an input named image which should be the image to the caption and returns a text that contains the description in English. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_22.txt b/chunked/nltk_chunking/_custom_tools/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb11b32f3c5b209bf7dbb7c565575ba87dd55399 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_22.txt @@ -0,0 +1,2 @@ +[] +Task: "Answer the question in the variable question about the image stored in the variable image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_23.txt b/chunked/nltk_chunking/_custom_tools/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b0c1c88c35041ea89c74616e9d2a88473da9d64 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_23.txt @@ -0,0 +1 @@ +The question is in French." \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_24.txt b/chunked/nltk_chunking/_custom_tools/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..5011526fe57945195ad8cca0159939d719619136 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_24.txt @@ -0,0 +1 @@ +I will use the following tools: translator to translate the question into English and then image_qa to answer the question on the input image. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_25.txt b/chunked/nltk_chunking/_custom_tools/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fb843bed4bf5e1a0a6b79df7b3f875d35bff9e3 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_25.txt @@ -0,0 +1,4 @@ +Answer: +py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_26.txt b/chunked/nltk_chunking/_custom_tools/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d832b6b0431ce6aef78b67c31a0f5d120773c29d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_26.txt @@ -0,0 +1,3 @@ +answer = image_qa(image=image, question=translated_question) +print(f"The answer is {answer}") +Task: "Identify the oldest person in the document and create an image showcasing the result as a banner." \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_27.txt b/chunked/nltk_chunking/_custom_tools/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bc47beb9eb89a7bcc7aa371c25587a01d8b1d86 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_27.txt @@ -0,0 +1 @@ +I will use the following tools: document_qa to find the oldest person in the document, then image_generator to generate an image according to the answer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_28.txt b/chunked/nltk_chunking/_custom_tools/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a54e1abcdd439bc6787957bc00d20c32ca7bc78 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_28.txt @@ -0,0 +1,3 @@ +Answer: +py +answer = document_qa(document, question="What is the oldest person?") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_29.txt b/chunked/nltk_chunking/_custom_tools/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b4f701941aaae4f74ed93192f9d388b0493de4d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_29.txt @@ -0,0 +1 @@ +print(f"The answer is {answer}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_3.txt b/chunked/nltk_chunking/_custom_tools/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..80c26ee45c13f38a8e216ffc38d558552f66f17d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_3.txt @@ -0,0 +1 @@ +Creating and using custom tools and prompts is paramount to empowering the agent and having it perform new tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_30.txt b/chunked/nltk_chunking/_custom_tools/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..e92935f90af69f12f185903a0be16232bc08195e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_30.txt @@ -0,0 +1,6 @@ +image = image_generator("A banner showing " + answer) +[] +Task: "Draw me a picture of rivers and lakes" +I will use the following +` +The introduction (the text before "Tools:") explains precisely how the model shall behave and what it should do. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_31.txt b/chunked/nltk_chunking/_custom_tools/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..f49f1978f1f6bc32279b6e4f2f455d968aac9da8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_31.txt @@ -0,0 +1 @@ +This part most likely does not need to be customized as the agent shall always behave the same way. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_32.txt b/chunked/nltk_chunking/_custom_tools/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..89af2cc2a6b2846d6667fde28de255b729d049eb --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_32.txt @@ -0,0 +1 @@ +The second part (the bullet points below "Tools") is dynamically added upon calling run or chat. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_33.txt b/chunked/nltk_chunking/_custom_tools/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3ae1c05d752c4dcbe42fb9a7ae3556029f4803a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_33.txt @@ -0,0 +1,6 @@ +There are +exactly as many bullet points as there are tools in agent.toolbox and each bullet point consists of the name +and description of the tool: +text +- : +Let's verify this quickly by loading the document_qa tool and printing out the name and description. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_34.txt b/chunked/nltk_chunking/_custom_tools/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e9cf281ca7b5bf9497b77032cdd409905da5e49 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_34.txt @@ -0,0 +1,7 @@ +from transformers import load_tool +document_qa = load_tool("document-question-answering") +print(f"- {document_qa.name}: {document_qa.description}") + +which gives: +text +- document_qa: This is a tool that answers a question about a document (pdf). \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_35.txt b/chunked/nltk_chunking/_custom_tools/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..20cc5137fc86ebb9b278da1dd6b8bb04587ca979 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_35.txt @@ -0,0 +1 @@ +It takes an input named `document` which should be the document containing the information, as well as a `question` that is the question about the document. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_36.txt b/chunked/nltk_chunking/_custom_tools/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..75e95826fe61e961f8def5300ade5b3a2b1d507a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_36.txt @@ -0,0 +1 @@ +It returns a text that contains the answer to the question. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_37.txt b/chunked/nltk_chunking/_custom_tools/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d8885bddc5cee365184578bcec6094d8742fee5 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_37.txt @@ -0,0 +1 @@ +We can see that the tool name is short and precise. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_38.txt b/chunked/nltk_chunking/_custom_tools/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..2af5ec8766b22139be0781425280e055cb3e979f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_38.txt @@ -0,0 +1,2 @@ +The description includes two parts, the first explaining +what the tool does and the second states what input arguments and return values are expected. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_39.txt b/chunked/nltk_chunking/_custom_tools/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..a4d165dcbef0f525ada9b9cbc093add99bebaa2a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_39.txt @@ -0,0 +1 @@ +A good tool name and tool description are very important for the agent to correctly use it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_4.txt b/chunked/nltk_chunking/_custom_tools/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..f24ab3aac8aef787d72db63d55266efbc1757b61 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_4.txt @@ -0,0 +1,8 @@ +In this guide we'll take a look at: + +How to customize the prompt +How to use custom tools +How to create custom tools + +Customizing the prompt +As explained in Transformers Agents agents can run in [~Agent.run] and [~Agent.chat] mode. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_40.txt b/chunked/nltk_chunking/_custom_tools/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7fb4eada401c775821c81a840d86297341e6a6a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_40.txt @@ -0,0 +1,3 @@ +Note that the only +information the agent has about the tool is its name and description, so one should make sure that both +are precisely written and match the style of the existing tools in the toolbox. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_41.txt b/chunked/nltk_chunking/_custom_tools/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..4311ea7e1198216a07c5392fefd9f62257a8402a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_41.txt @@ -0,0 +1,3 @@ +In particular make sure the description +mentions all the arguments expected by name in code-style, along with the expected type and a description of what they +are. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_42.txt b/chunked/nltk_chunking/_custom_tools/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cb22ea0db7ac95dc44cec53c095597b71ad288a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_42.txt @@ -0,0 +1,2 @@ +Check the naming and description of the curated Transformers tools to better understand what name and +description a tool is expected to have. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_43.txt b/chunked/nltk_chunking/_custom_tools/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..4842c3dd757ecbe1c5545f409f76413081bac070 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_43.txt @@ -0,0 +1 @@ +You can see all tools with the [Agent.toolbox] property. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_44.txt b/chunked/nltk_chunking/_custom_tools/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ba5122c99589d2e9b5bbce241fb973dca15182b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_44.txt @@ -0,0 +1,2 @@ +The third part includes a set of curated examples that show the agent exactly what code it should produce +for what kind of user request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_45.txt b/chunked/nltk_chunking/_custom_tools/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..18c5f3229a341a3924fd50f7692688e282f3871e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_45.txt @@ -0,0 +1,2 @@ +The large language models empowering the agent are extremely good at +recognizing patterns in a prompt and repeating the pattern with new data. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_46.txt b/chunked/nltk_chunking/_custom_tools/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7d103872a94c71fd32a0de55cf5ce2052c18e9c --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_46.txt @@ -0,0 +1,3 @@ +Therefore, it is very important +that the examples are written in a way that maximizes the likelihood of the agent to generating correct, +executable code in practice. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_47.txt b/chunked/nltk_chunking/_custom_tools/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..a64ab77ce7a97628a2bff9904b5c744318437286 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_47.txt @@ -0,0 +1,3 @@ +Let's have a look at one example: +```text +Task: "Identify the oldest person in thedocument` and create an image showcasing the result as a banner." \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_48.txt b/chunked/nltk_chunking/_custom_tools/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..5bc47beb9eb89a7bcc7aa371c25587a01d8b1d86 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_48.txt @@ -0,0 +1 @@ +I will use the following tools: document_qa to find the oldest person in the document, then image_generator to generate an image according to the answer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_49.txt b/chunked/nltk_chunking/_custom_tools/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a54e1abcdd439bc6787957bc00d20c32ca7bc78 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_49.txt @@ -0,0 +1,3 @@ +Answer: +py +answer = document_qa(document, question="What is the oldest person?") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_5.txt b/chunked/nltk_chunking/_custom_tools/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..7faf8505deea86da172b13409a2afebe7f05444b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_5.txt @@ -0,0 +1 @@ +Both the run and chat modes underlie the same logic. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_50.txt b/chunked/nltk_chunking/_custom_tools/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b4f701941aaae4f74ed93192f9d388b0493de4d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_50.txt @@ -0,0 +1 @@ +print(f"The answer is {answer}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_51.txt b/chunked/nltk_chunking/_custom_tools/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..664a3143b60cd192462faec7bebf59cbad06142e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_51.txt @@ -0,0 +1,4 @@ +image = image_generator("A banner showing " + answer) +` +The pattern the model is prompted to repeat has three parts: The task statement, the agent's explanation of +what it intends to do, and finally the generated code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_52.txt b/chunked/nltk_chunking/_custom_tools/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..726bae94ce0214de1bc450c50b0cac071703d1c8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_52.txt @@ -0,0 +1,2 @@ +Every example that is part of the prompt has this exact +pattern, thus making sure that the agent will reproduce exactly the same pattern when generating new tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_53.txt b/chunked/nltk_chunking/_custom_tools/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..72c6678a86f74f7559b918c2fe7e228bf498d54f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_53.txt @@ -0,0 +1,3 @@ +The prompt examples are curated by the Transformers team and rigorously evaluated on a set of +problem statements +to ensure that the agent's prompt is as good as possible to solve real use cases of the agent. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_54.txt b/chunked/nltk_chunking/_custom_tools/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..f807461687952e2051500d058eb13b4c9110d9a0 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_54.txt @@ -0,0 +1,6 @@ +The final part of the prompt corresponds to: +```text +Task: "Draw me a picture of rivers and lakes" +I will use the following + +is a final and unfinished example that the agent is tasked to complete. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_55.txt b/chunked/nltk_chunking/_custom_tools/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..00c7f2337e15cac9e1050e43e80f951ade114151 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_55.txt @@ -0,0 +1,2 @@ +The unfinished example +is dynamically created based on the actual user input. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_56.txt b/chunked/nltk_chunking/_custom_tools/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..08dee344b8a5a79d08284c50fc56619441a89783 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_56.txt @@ -0,0 +1,5 @@ +For the above example, the user ran: +py +agent.run("Draw me a picture of rivers and lakes") +The user input - a.k.a the task: "Draw me a picture of rivers and lakes" is cast into the +prompt template: "Task: \n\n I will use the following". 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_57.txt b/chunked/nltk_chunking/_custom_tools/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..567a53e59fd63f011f5bd506aef3ef117fb53733 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_57.txt @@ -0,0 +1,3 @@ +This sentence makes up the final lines of the +prompt the agent is conditioned on, therefore strongly influencing the agent to finish the example +exactly in the same way it was previously done in the examples. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_58.txt b/chunked/nltk_chunking/_custom_tools/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9278799fea2aa57e72446adfe8f9451f8fdc362 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_58.txt @@ -0,0 +1,2 @@ +Without going into too much detail, the chat template has the same prompt structure with the +examples having a slightly different style, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_59.txt b/chunked/nltk_chunking/_custom_tools/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a83bc83507f32d83e6c992b558f0eed503a58ee --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_59.txt @@ -0,0 +1,5 @@ +: +````text +[] +===== +Human: Answer the question in the variable question about the image stored in the variable image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_6.txt b/chunked/nltk_chunking/_custom_tools/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fe3750b2ac8fa68c953eed055447c53ed74f865 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_6.txt @@ -0,0 +1,2 @@ +The language model powering the agent is conditioned on a long +prompt and completes the prompt by generating the next tokens until the stop token is reached. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_60.txt b/chunked/nltk_chunking/_custom_tools/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d20a12e5f620f08b5b3848df642a50b8ac4c21e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_60.txt @@ -0,0 +1 @@ +Assistant: I will use the tool image_qa to answer the question on the input image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_61.txt b/chunked/nltk_chunking/_custom_tools/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..245820a01fb92bc40ef9a0256c4d024c8afcd49d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_61.txt @@ -0,0 +1,4 @@ +py +answer = image_qa(text=question, image=image) +print(f"The answer is {answer}") +Human: I tried this code, it worked but didn't give me a good result. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_62.txt b/chunked/nltk_chunking/_custom_tools/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6e107ffea240293835abe1acc39b3d5d679d525 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_62.txt @@ -0,0 +1,2 @@ +The question is in French +Assistant: In this case, the question needs to be translated first. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_63.txt b/chunked/nltk_chunking/_custom_tools/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..b93dac47df1b09c3254401ae734a37ab3d8f6511 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_63.txt @@ -0,0 +1 @@ +I will use the tool translator to do this. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_64.txt b/chunked/nltk_chunking/_custom_tools/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbbae20d61258fa6f1d5dc7ac775c6156e87b294 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_64.txt @@ -0,0 +1,3 @@ +py +translated_question = translator(question=question, src_lang="French", tgt_lang="English") +print(f"The translated question is {translated_question}.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_65.txt b/chunked/nltk_chunking/_custom_tools/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd890add4fc3a1001f6fa25855786f2a14717dca --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_65.txt @@ -0,0 +1,7 @@ +answer = image_qa(text=translated_question, image=image) +print(f"The answer is {answer}") +===== +[] +` +Contrary, to the examples of the run prompt, each chat prompt example has one or more exchanges between the +Human and the Assistant. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_66.txt b/chunked/nltk_chunking/_custom_tools/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..30535c2f1043d90ae42204273d06e52ca8d8a1d4 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_66.txt @@ -0,0 +1 @@ +Every exchange is structured similarly to the example of the run prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_67.txt b/chunked/nltk_chunking/_custom_tools/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0fba436966fa62cd8c5d7b3e54e93cfde3704c5 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_67.txt @@ -0,0 +1,2 @@ +The user's input is appended to behind Human: and the agent is prompted to first generate what needs to be done +before generating code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_68.txt b/chunked/nltk_chunking/_custom_tools/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..a50d9acf39efc1f11d399b8bd1f2a679f5790160 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_68.txt @@ -0,0 +1,2 @@ +An exchange can be based on previous exchanges, therefore allowing the user to refer +to past exchanges as is done e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_69.txt b/chunked/nltk_chunking/_custom_tools/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7201c275f89e0d7cfed82bbe44fbe422760527f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_69.txt @@ -0,0 +1,2 @@ +above by the user's input of "I tried this code" refers to the +previously generated code of the agent. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_7.txt b/chunked/nltk_chunking/_custom_tools/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..43d712d8f1e94d231ca2796433e5156374a782a1 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_7.txt @@ -0,0 +1,2 @@ +The only difference between the two modes is that during the chat mode the prompt is extended with +previous user inputs and model generations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_70.txt b/chunked/nltk_chunking/_custom_tools/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..adbbe08db743f049859273b68404ee1e8520a1c9 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_70.txt @@ -0,0 +1,4 @@ +Upon running .chat, the user's input or task is cast into an unfinished example of the form: +text +Human: \n\nAssistant: +which the agent completes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_71.txt b/chunked/nltk_chunking/_custom_tools/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b13c062364b5495dac85ba07f3eef9cd070849f --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_71.txt @@ -0,0 +1,2 @@ +Contrary to the run command, the chat command then appends the completed example +to the prompt, thus giving the agent more context for the next chat turn. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_72.txt b/chunked/nltk_chunking/_custom_tools/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..979cfe51bf47b503d768bb0332eaf4cf23cc8cef --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_72.txt @@ -0,0 +1 @@ +Great now that we know how the prompt is structured, let's see how we can customize it! \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_73.txt b/chunked/nltk_chunking/_custom_tools/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebfad7baac6a19eda34c1abda64aa4c7e330b859 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_73.txt @@ -0,0 +1,3 @@ +Writing good user inputs +While large language models are getting better and better at understanding users' intentions, it helps +enormously to be as precise as possible to help the agent pick the correct task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_74.txt b/chunked/nltk_chunking/_custom_tools/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..80cd447c8293d17f2a5e78c17bb542bfd913a85a --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_74.txt @@ -0,0 +1,2 @@ +What does it mean to be +as precise as possible? \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_75.txt b/chunked/nltk_chunking/_custom_tools/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..a897ac1782def702c19726d965869e147df5d823 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_75.txt @@ -0,0 +1 @@ +The agent sees a list of tool names and their description in its prompt. 
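The chat flow described in the chunks above is easiest to see end to end in code. The sketch below is a minimal illustration, assuming the transformers agents API (`HfAgent`, `.chat()`, and `prepare_for_new_chat()`) is available in your installed version; the endpoint URL and the example tasks are placeholders.

```python
# Minimal sketch of chat mode (hedged: API availability depends on your
# transformers version; the endpoint URL and tasks are placeholders).
from transformers import HfAgent

agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")

# First turn: the task is wrapped into "Human: <task>\n\nAssistant:" internally.
picture = agent.chat("Generate a picture of rivers and lakes")

# Second turn: because the completed exchange was appended to the prompt,
# "the picture" can refer back to what was produced in the previous turn.
updated_picture = agent.chat("Transform the picture so that there is a rock in there")

# Reset the accumulated chat history when you want to start from a clean prompt.
agent.prepare_for_new_chat()
```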
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_76.txt b/chunked/nltk_chunking/_custom_tools/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..8128510d7dd068bf6d65960c02f51ed71ccf9dbd --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_76.txt @@ -0,0 +1,3 @@ +The more tools are added the +more difficult it becomes for the agent to choose the correct tool and it's even more difficult to choose +the correct sequences of tools to run. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_77.txt b/chunked/nltk_chunking/_custom_tools/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..f225bb3e1f2d1cad8e025ef63bea0144c056c768 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_77.txt @@ -0,0 +1,2 @@ +Let's look at a common failure case, here we will only return +the code to analyze it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_78.txt b/chunked/nltk_chunking/_custom_tools/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..96f24f905fd608bd75a2bd9f9a9ef39d65918282 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_78.txt @@ -0,0 +1,8 @@ +from transformers import HfAgent +agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder") +agent.run("Show me a tree", return_code=True) + +gives: +``text +==Explanation from the agent== +I will use the following tool:image_segmenter` to create a segmentation mask for the image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_79.txt b/chunked/nltk_chunking/_custom_tools/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4cf8c4bf192e5782fd9710c2cdb8c3617d6fd2d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_79.txt @@ -0,0 +1,4 @@ +==Code generated by the agent== +mask = image_segmenter(image, prompt="tree") + +which is probably not what we wanted. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_8.txt b/chunked/nltk_chunking/_custom_tools/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..0263b68cc2f0e525f53f768169091bcc412b7e5e --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_8.txt @@ -0,0 +1,2 @@ +This allows the agent to have access to past interactions, +seemingly giving the agent some kind of memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_80.txt b/chunked/nltk_chunking/_custom_tools/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..12816d233a63a831248f0e024aec5c78cbfb9e5b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_80.txt @@ -0,0 +1 @@ +Instead, it is more likely that we want an image of a tree to be generated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_81.txt b/chunked/nltk_chunking/_custom_tools/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..1576b1bed1ced3266aebe3d97b16ea89a23e8a22 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_81.txt @@ -0,0 +1,2 @@ +To steer the agent more towards using a specific tool it can therefore be very helpful to use important keywords that +are present in the tool's name and description. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_82.txt b/chunked/nltk_chunking/_custom_tools/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..8211825d737ce6ed745054395daaa441294a3d72 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_82.txt @@ -0,0 +1 @@ +Let's have a look. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_83.txt b/chunked/nltk_chunking/_custom_tools/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..c69e2fee291a48502dcbd7f881311d149ba73aff --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_83.txt @@ -0,0 +1,4 @@ +py +agent.toolbox["image_generator"].description +text +'This is a tool that creates an image according to a prompt, which is a text description. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_84.txt b/chunked/nltk_chunking/_custom_tools/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0aab639a2aff1d34d26bbf78611e11b8971ba9d --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_84.txt @@ -0,0 +1 @@ +It takes an input named `prompt` which contains the image description and outputs an image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_85.txt b/chunked/nltk_chunking/_custom_tools/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..18cc00af22f085c3dee90116e0986249886c6888 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_85.txt @@ -0,0 +1 @@ +The name and description make use of the keywords "image", "prompt", "create" and "generate". \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_86.txt b/chunked/nltk_chunking/_custom_tools/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b21f4e64c1f4397c0442fad02cdabf10ced8f45 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_86.txt @@ -0,0 +1 @@ +Using these words will most likely work better here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_87.txt b/chunked/nltk_chunking/_custom_tools/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6426407ffd6f3038ba8f5647900c628e29e4590 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_87.txt @@ -0,0 +1 @@ +Let's refine our prompt a bit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_88.txt b/chunked/nltk_chunking/_custom_tools/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..a30dcdbbcfde72ce95da52e0ee2a911da65554ae --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_88.txt @@ -0,0 +1,6 @@ +py +agent.run("Create an image of a tree", return_code=True) +gives: +``text +==Explanation from the agent== +I will use the following toolimage_generator` to generate an image of a tree. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_89.txt b/chunked/nltk_chunking/_custom_tools/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9425a167ee86844b06d5da4f5ca39aa21e9ca73 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_89.txt @@ -0,0 +1,4 @@ +==Code generated by the agent== +image = image_generator(prompt="tree") + +Much better! 
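Since the tool names and descriptions drive this keyword matching, it can help to print all of them once and reuse their vocabulary in your requests. A small sketch, assuming `agent` is the `HfAgent` created earlier and that `agent.toolbox` maps tool names to objects with a `description` attribute:

```python
# List every available tool with its description so you can pick the keywords
# ("image", "prompt", "generate", ...) to reuse when phrasing a task.
for name, tool in agent.toolbox.items():
    print(f"{name}: {tool.description}")
```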
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_9.txt b/chunked/nltk_chunking/_custom_tools/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..894be03fb55b88ba130d05a6b327dad41d3663c8 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_9.txt @@ -0,0 +1,2 @@ +Structure of the prompt +Let's take a closer look at how the prompt is structured to understand how it can be best customized. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_90.txt b/chunked/nltk_chunking/_custom_tools/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..86b3eddafef1d22151d6cbe2f0b1067b6bc9aba7 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_90.txt @@ -0,0 +1 @@ +That looks more like what we want. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_91.txt b/chunked/nltk_chunking/_custom_tools/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..91d945622982117a1abc13d89108f9afe3407b85 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_91.txt @@ -0,0 +1,3 @@ +In short, when you notice that the agent struggles to +correctly map your task to the correct tools, try looking up the most pertinent keywords of the tool's name +and description and try refining your task request with it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_92.txt b/chunked/nltk_chunking/_custom_tools/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e5d064d0725311b1c08519218c734d6c8c5c00 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_92.txt @@ -0,0 +1,2 @@ +Customizing the tool descriptions +As we've seen before the agent has access to each of the tools' names and descriptions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_93.txt b/chunked/nltk_chunking/_custom_tools/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..74a15c16a5265048adffabc67dd7576f1fdeaf47 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_93.txt @@ -0,0 +1,3 @@ +The base tools +should have very precise names and descriptions, however, you might find that it could help to change the +the description or name of a tool for your specific use case. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_94.txt b/chunked/nltk_chunking/_custom_tools/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..23a5f3a3839b587304b8b2c33ec3f7146797d63b --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_94.txt @@ -0,0 +1,3 @@ +This might become especially important +when you've added multiple tools that are very similar or if you want to use your agent only for a certain +domain, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_95.txt b/chunked/nltk_chunking/_custom_tools/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..4970a72244896f1fee27f2774c02ddbdf0b55a07 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_95.txt @@ -0,0 +1 @@ +image generation and transformations. 
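For example, a domain-specific agent might overwrite a built-in tool's description in place. The sketch below only illustrates the idea; the replacement wording is an assumption, not the library's text, so print the current description first and keep the input/output details accurate:

```python
# Inspect the current description, then overwrite it with domain-specific
# wording (hypothetical text; adapt it to the tool's real inputs and outputs).
print(agent.toolbox["image_transformer"].description)

agent.toolbox["image_transformer"].description = (
    "This is a tool that modifies an existing image according to a text prompt. "
    "It takes an input named `image` and a `prompt` describing the change, "
    "and returns the modified image."
)
```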
\ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_96.txt b/chunked/nltk_chunking/_custom_tools/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..001abd454487b6f0cbf8293c48d1685f697b4b52 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_96.txt @@ -0,0 +1,2 @@ +A common problem is that the agent confuses image generation with image transformation/modification when +it is used heavily for image generation tasks, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_97.txt b/chunked/nltk_chunking/_custom_tools/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..796e033452a781254ef743e2108020134b68f6a5 --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_97.txt @@ -0,0 +1,6 @@ +python +agent.run("Make an image of a house and a car", return_code=True) +returns +```text +==Explanation from the agent== +I will use the following tools: `image_generator` to generate an image of a house and `image_transformer` to transform the image of a car into the image of a house. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_98.txt b/chunked/nltk_chunking/_custom_tools/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..62c95891ff37361787381e269c58f710e62de0bd --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_98.txt @@ -0,0 +1,6 @@ +==Code generated by the agent== +house_image = image_generator(prompt="A house") +car_image = image_generator(prompt="A car") +house_car_image = image_transformer(image=car_image, prompt="A house") + +which is probably not exactly what we want here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_custom_tools/chunk_99.txt b/chunked/nltk_chunking/_custom_tools/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ac83f1fa40f94b1b43f64fff8fe92e49d53c9ef --- /dev/null +++ b/chunked/nltk_chunking/_custom_tools/chunk_99.txt @@ -0,0 +1,2 @@ +It seems like the agent has a hard time +understanding the difference between image_generator and image_transformer and often uses the two together. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_0.txt b/chunked/nltk_chunking/_debugging/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c7dd018706a2b92c0ba9f7086f530fdd3dc0976 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_0.txt @@ -0,0 +1,3 @@ + +Debugging +Training on multiple GPUs can be a tricky endeavor, whether you're running into installation issues or communication problems between your GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_1.txt b/chunked/nltk_chunking/_debugging/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d97e318cab064575b33d408f54ce46b5e5f878c --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_1.txt @@ -0,0 +1 @@ +This debugging guide covers some issues you may run into and how to resolve them.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_10.txt b/chunked/nltk_chunking/_debugging/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..fee55cd50c0906daf02a4a9ba49bf6fd6d7163b2 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_10.txt @@ -0,0 +1,5 @@ +When CUDA is correctly setup and added to your PATH environment variable, you can find the installation location with the following command: + +which nvcc +Multiple CUDA toolkits +You may also have more than one CUDA toolkit installed system-wide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_11.txt b/chunked/nltk_chunking/_debugging/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..2620456e056cc7bbc01043cf00ff86c39520a9d1 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_11.txt @@ -0,0 +1,3 @@ +/usr/local/cuda-10.2 +/usr/local/cuda-11.0 +Typically, package installers set the paths to whatever the last version was installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_12.txt b/chunked/nltk_chunking/_debugging/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..393af3ab4cc8620f7f847e7b2a7fc29ace2e547d --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_12.txt @@ -0,0 +1 @@ +If the package build fails because it can't find the right CUDA version (despite it being installed system-wide already), then you need to configure the PATH and LD_LIBRARY_PATH environment variables to point to the correct path. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_13.txt b/chunked/nltk_chunking/_debugging/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..341e537f3abff20129ffcc7b53020df02ffaf1ee --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_13.txt @@ -0,0 +1,5 @@ +Take a look at the contents of these environment variables first: + +echo $PATH +echo $LD_LIBRARY_PATH +PATH lists the locations of the executables and LD_LIBRARY_PATH lists where to look for shared libraries. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_14.txt b/chunked/nltk_chunking/_debugging/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddd9cc3aa773ca48ad593203aea59ac07d23362b --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_14.txt @@ -0,0 +1 @@ +Earlier entries are prioritized over later ones, and : is used to separate multiple entries. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_15.txt b/chunked/nltk_chunking/_debugging/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a0226637abb4e018c163ef27782f70f52018724 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_15.txt @@ -0,0 +1 @@ +To tell the build program where to find the specific CUDA toolkit you want, insert the correct path to list first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_16.txt b/chunked/nltk_chunking/_debugging/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8d3e6e1b2a8ed3dbb6185b64cbc3050b8e37e80 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_16.txt @@ -0,0 +1 @@ +This command prepends rather than overwrites the existing values. 
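A quick way to compare the CUDA version PyTorch was built with against the toolkit found on your PATH is sketched below; this is a hedged helper, not part of DeepSpeed or Transformers:

```python
# Compare PyTorch's build-time CUDA version with the system nvcc on PATH;
# the two should match before building DeepSpeed's CUDA extensions.
import shutil
import subprocess

import torch

print("PyTorch built with CUDA:", torch.version.cuda)

nvcc = shutil.which("nvcc")
if nvcc is None:
    print("nvcc not found on PATH; check the CUDA toolkit installation.")
else:
    print("nvcc found at:", nvcc)
    # The "release X.Y" line of `nvcc --version` shows the system toolkit version.
    print(subprocess.run([nvcc, "--version"], capture_output=True, text=True).stdout)
```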
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_17.txt b/chunked/nltk_chunking/_debugging/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a7147629960c86081d1166e0b42beae4eeb8f4c --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_17.txt @@ -0,0 +1,6 @@ +```bash +adjust the version and full path if needed +export PATH=/usr/local/cuda-10.2/bin:$PATH +export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH + +In addition, you should also check the directories you assign actually exist. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_18.txt b/chunked/nltk_chunking/_debugging/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe027e535e9f2be930490ad3f9f3b9d4a05b877e --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_18.txt @@ -0,0 +1 @@ +The lib64 sub-directory contains various CUDA .so objects (like libcudart.so) and while it is unlikely your system names them differently, you should check the actual names and change them accordingly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_19.txt b/chunked/nltk_chunking/_debugging/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f98a58f403e2836a9ac71f697f2c7bd569ff538 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_19.txt @@ -0,0 +1,2 @@ +Older CUDA versions +Sometimes, older CUDA versions may refuse to build with newer compilers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_2.txt b/chunked/nltk_chunking/_debugging/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d6b1a9087e67c181f8f31b704af00ebffd819a0 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_2.txt @@ -0,0 +1,2 @@ +DeepSpeed CUDA installation +If you're using DeepSpeed, you've probably already installed it with the following command. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_20.txt b/chunked/nltk_chunking/_debugging/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..8beba95be94e09a962edbd74a6ab44ad32ffecb4 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_20.txt @@ -0,0 +1 @@ +For example, if you have gcc-9 but CUDA wants gcc-7. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_21.txt b/chunked/nltk_chunking/_debugging/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f66730d31c82f23dae26155409f5a2d11e84467 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_21.txt @@ -0,0 +1 @@ +Usually, installing the latest CUDA toolkit enables support for the newer compiler. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_22.txt b/chunked/nltk_chunking/_debugging/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab868fb54c15d09c22fddcc48a40c4025fd836ea --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_22.txt @@ -0,0 +1 @@ +You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_23.txt b/chunked/nltk_chunking/_debugging/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..b061fe3f26f60be4cdc6539c72b7d083d1a4b7e3 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_23.txt @@ -0,0 +1 @@ +To resolve this, you can create a symlink to give the build system visibility to the older compiler. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_24.txt b/chunked/nltk_chunking/_debugging/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..03bb2eee79ef3e360b30a5d9d9a5cf5e7ff28488 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_24.txt @@ -0,0 +1,7 @@ +```bash +adapt the path to your system +sudo ln -s /usr/bin/gcc-7 /usr/local/cuda-10.2/bin/gcc +sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++ + +Prebuild +If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, you can try to prebuild the DeepSpeed modules before installing them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_25.txt b/chunked/nltk_chunking/_debugging/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1c783af3c1ae414565d1f9e00caac63d15cbdc5 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_25.txt @@ -0,0 +1,6 @@ +To make a local build for DeepSpeed: + +git clone https://github.com/microsoft/DeepSpeed/ +cd DeepSpeed +rm -rf build +TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_26.txt b/chunked/nltk_chunking/_debugging/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..886755e992f1a439ca0f8baaa6531ce0dcee29fa --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_26.txt @@ -0,0 +1,5 @@ +\ +--global-option="build_ext" --global-option="-j8" --no-cache -v \ +--disable-pip-version-check 2>&1 | tee build.log + +To use NVMe offload, add the DS_BUILD_AIO=1 parameter to the build command and make sure you install the libaio-dev package system-wide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_27.txt b/chunked/nltk_chunking/_debugging/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9f4ffb4cf608ed7ef603a06e64a1a293f3e4d19 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_27.txt @@ -0,0 +1 @@ +Next, you'll have to specify your GPU's architecture by editing the TORCH_CUDA_ARCH_LIST variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this page). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_28.txt b/chunked/nltk_chunking/_debugging/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c59591e098591a906bacab08cf30b35d7499cd --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_28.txt @@ -0,0 +1,13 @@ +To check the PyTorch version that corresponds to your architecture, run the following command: + +python -c "import torch; print(torch.cuda.get_arch_list())" +Find the architecture for a GPU with the following command: + +CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())" + +To find the architecture for GPU 0: + +CUDA_VISIBLE_DEVICES=0 python -c "import torch; \ +print(torch.cuda.get_device_properties(torch.device('cuda'))) +"_CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)" +This means your GPU architecture is 8.6. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_29.txt b/chunked/nltk_chunking/_debugging/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e7b9435de8a28dfead19a4caba5e60f3f485b54 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_29.txt @@ -0,0 +1 @@ +If you get 8, 6, then you can set TORCH_CUDA_ARCH_LIST="8.6". \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_3.txt b/chunked/nltk_chunking/_debugging/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9211aeb781188fc9aebecf9289ffcfbf07346ab --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_3.txt @@ -0,0 +1,2 @@ +pip install deepspeed +DeepSpeed compiles CUDA C++ code and it can be a potential source of errors when building PyTorch extensions that require CUDA. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_30.txt b/chunked/nltk_chunking/_debugging/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..24f74f5a4c15a649054bd76c6a1f3c29873f4fc9 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_30.txt @@ -0,0 +1 @@ +For multiple GPUs with different architectures, list them like TORCH_CUDA_ARCH_LIST="6.1;8.6". \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_31.txt b/chunked/nltk_chunking/_debugging/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..b21b3e5da2faae7589e61930f334a9561cf03f56 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_31.txt @@ -0,0 +1 @@ +It is also possible to not specify TORCH_CUDA_ARCH_LIST and the build program automatically queries the GPU architecture of the build. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_32.txt b/chunked/nltk_chunking/_debugging/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8dcad02cb91b510f2aad589a867d1ad1b5dcad8 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_32.txt @@ -0,0 +1 @@ +However, it may or may not match the actual GPU on the target machine which is why it is better to explicitly specify the correct architecture. 
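If you want to derive the value instead of looking it up by hand, a small sketch (an illustrative helper, not an official script) that covers every GPU visible to PyTorch looks like this:

```python
# Build a TORCH_CUDA_ARCH_LIST value such as "6.1;8.6" from the compute
# capabilities of all GPUs visible on this machine.
import torch

capabilities = {
    "{}.{}".format(*torch.cuda.get_device_capability(i))
    for i in range(torch.cuda.device_count())
}
print("TORCH_CUDA_ARCH_LIST=" + ";".join(sorted(capabilities)))
```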
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_33.txt b/chunked/nltk_chunking/_debugging/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..60373010390780941453e529109f14157bfbd05e --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_33.txt @@ -0,0 +1,8 @@ +For training on multiple machines with the same setup, you'll need to make a binary wheel: + +git clone https://github.com/microsoft/DeepSpeed/ +cd DeepSpeed +rm -rf build +TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \ +python setup.py build_ext -j8 bdist_wheel +This command generates a binary wheel that'll look something like dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_34.txt b/chunked/nltk_chunking/_debugging/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..94b0963c4b88903e6c2fd8f0cde6983acd735d41 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_34.txt @@ -0,0 +1 @@ +Now you can install this wheel locally or on another machine. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_35.txt b/chunked/nltk_chunking/_debugging/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1ddba78eeb0e5823c0a7b4dae907e1db9395f53 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_35.txt @@ -0,0 +1,3 @@ +pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl +Multi-GPU Network Issues Debug +When training or inferencing with DistributedDataParallel and multiple GPU, if you run into issue of inter-communication between processes and/or nodes, you can use the following script to diagnose network issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_36.txt b/chunked/nltk_chunking/_debugging/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..c064d37e272ac74181f0fccf600ac3c25e0219bb --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_36.txt @@ -0,0 +1,5 @@ +wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py +For example to test how 2 GPUs interact do: + +python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +If both processes can talk to each and allocate GPU memory each will print an OK status. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_37.txt b/chunked/nltk_chunking/_debugging/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..e029fa56c241877b15e63139d261a8d2c6713ca3 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_37.txt @@ -0,0 +1 @@ +For more GPUs or nodes adjust the arguments in the script. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_38.txt b/chunked/nltk_chunking/_debugging/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d7ae6dc5ea5ba599c61964e80c39bdb35b15f9d --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_38.txt @@ -0,0 +1 @@ +You will find a lot more details inside the diagnostics script and even a recipe to how you could run it in a SLURM environment. 
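For intuition, the essence of such a connectivity check is sketched below; this is not the torch-distributed-gpu-test.py script itself, just a minimal stand-in, and the file name check.py in the launch comment is hypothetical:

```python
# check.py -- minimal NCCL connectivity probe (hedged sketch).
# Launch with: python -m torch.distributed.run --nproc_per_node 2 check.py
import os

import torch
import torch.distributed as dist

def main():
    # torch.distributed.run sets MASTER_ADDR/PORT, RANK and LOCAL_RANK for us.
    dist.init_process_group("nccl")
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)

    payload = torch.ones(1, device=f"cuda:{local_rank}")
    dist.all_reduce(payload)  # sums the ones contributed by every rank
    dist.barrier()
    print(f"rank {dist.get_rank()}: OK (all_reduce total = {payload.item()})")

if __name__ == "__main__":
    main()
```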
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_39.txt b/chunked/nltk_chunking/_debugging/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f84e2c8aa0298a92afda4fab0e96be3344d4b2a --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_39.txt @@ -0,0 +1,4 @@ +An additional level of debug is to add NCCL_DEBUG=INFO environment variable as follows: + +NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py +This will dump a lot of NCCL-related debug information, which you can then search online if you find that some problems are reported. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_4.txt b/chunked/nltk_chunking/_debugging/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..86a1329e7c594369ca4aeb11f90c7a9ac4c4a24e --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_4.txt @@ -0,0 +1 @@ +These errors depend on how CUDA is installed on your system, and this section focuses on PyTorch built with CUDA 10.2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_40.txt b/chunked/nltk_chunking/_debugging/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..c44ab10ca439754ff47a587c12b030233ffb6193 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_40.txt @@ -0,0 +1 @@ +Or if you're not sure how to interpret the output you can share the log file in an Issue. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_41.txt b/chunked/nltk_chunking/_debugging/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d34b4234457572b09ae735748516494e54dc1a3 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_41.txt @@ -0,0 +1,3 @@ +Underflow and Overflow Detection + +This feature is currently available for PyTorch-only. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_42.txt b/chunked/nltk_chunking/_debugging/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..77736b8e3c99b062f9e7c1a3e97a5e5a87a879ce --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_42.txt @@ -0,0 +1 @@ +For multi-GPU training it requires DDP (torch.distributed.launch). \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_43.txt b/chunked/nltk_chunking/_debugging/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..719d288c9055088e86a1bce2e4046cc96c32f561 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_43.txt @@ -0,0 +1 @@ +This feature can be used with any nn.Module-based model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_44.txt b/chunked/nltk_chunking/_debugging/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4251dbe346d645107e17aed12ae92134501f73b --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_44.txt @@ -0,0 +1,2 @@ +If you start getting loss=NaN or the model inhibits some other abnormal behavior due to inf or nan in +activations or weights one needs to discover where the first underflow or overflow happens and what led to it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_45.txt b/chunked/nltk_chunking/_debugging/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7fc249eb79d4e4dec0a53a91b2b925f31ae74d9 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_45.txt @@ -0,0 +1,2 @@ +Luckily +you can accomplish that easily by activating a special module that will do the detection automatically. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_46.txt b/chunked/nltk_chunking/_debugging/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..5dc3d577aba6f58a005e2785c0fec8ac6b5ea083 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_46.txt @@ -0,0 +1,5 @@ +If you're using [Trainer], you just need to add: + +--debug underflow_overflow +to the normal command line arguments, or pass debug="underflow_overflow" when creating the +[TrainingArguments] object. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_47.txt b/chunked/nltk_chunking/_debugging/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..08d2dd496e4e02331fdd315d5b5efb6d092eb14d --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_47.txt @@ -0,0 +1,7 @@ +If you're using your own training loop or another Trainer you can accomplish the same with: +thon +from transformers.debug_utils import DebugUnderflowOverflow +debug_overflow = DebugUnderflowOverflow(model) + +[~debug_utils.DebugUnderflowOverflow] inserts hooks into the model that immediately after each +forward call will test input and output variables and also the corresponding module's weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_48.txt b/chunked/nltk_chunking/_debugging/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f5fc401931159227341272ecd3ee591d5a76a3d --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_48.txt @@ -0,0 +1,41 @@ +As soon as inf or +nan is detected in at least one element of the activations or weights, the program will assert and print a report +like this (this was caught with google/mt5-small under fp16 mixed precision): +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata + encoder.block.1.layer.1.DenseReluDense.dropout Dropout +0.00e+00 2.57e+02 input[0] +0.00e+00 2.85e+02 output +[] + encoder.block.2.layer.0 T5LayerSelfAttention +6.78e-04 3.15e+03 input[0] +2.65e-04 3.42e+03 output[0] + None output[1] +2.25e-01 1.00e+04 output[2] + encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.dropout Dropout +0.00e+00 8.76e+03 input[0] +0.00e+00 9.74e+03 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +The example output has been trimmed in the middle for brevity. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_49.txt b/chunked/nltk_chunking/_debugging/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea251f5f55cdd6c022ba32acec4d66efcfa80a34 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_49.txt @@ -0,0 +1,2 @@ +The second column shows the value of the absolute largest element, so if you have a closer look at the last few frames, +the inputs and outputs were in the range of 1e4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_5.txt b/chunked/nltk_chunking/_debugging/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa0fca958a58ee3cbc3bcd5f6bad0c5a4126153c --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_5.txt @@ -0,0 +1 @@ +For any other installation issues, please open an issue with the DeepSpeed team. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_50.txt b/chunked/nltk_chunking/_debugging/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..15f3a439e75f70abfa11302b03c8f41294a8cb83 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_50.txt @@ -0,0 +1,2 @@ +So when this training was done under fp16 mixed precision the very +last step overflowed (since under fp16 the largest number before inf is 64e3). \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_51.txt b/chunked/nltk_chunking/_debugging/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..440a92f573dc79d68534c2d18a589755d8e8aa03 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_51.txt @@ -0,0 +1,3 @@ +To avoid overflows under +fp16 the activations must remain way below 1e4, because 1e4 * 1e4 = 1e8 so any matrix multiplication with +large activations is going to lead to a numerical overflow condition. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_52.txt b/chunked/nltk_chunking/_debugging/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..9844958b972f297cef4f420c0a4ba4ccd2aba7de --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_52.txt @@ -0,0 +1 @@ +At the very start of the trace you can discover at which batch number the problem occurred (here Detected inf/nan during batch_number=0 means the problem occurred on the first batch). \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_53.txt b/chunked/nltk_chunking/_debugging/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66ed0b40f364aec404260d99b0db6d18534e47f --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_53.txt @@ -0,0 +1,2 @@ +Each reported frame starts by declaring the fully qualified entry for the corresponding module this frame is reporting +for. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_54.txt b/chunked/nltk_chunking/_debugging/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d9951c9c753b912e5ede9233e9fdee9ff113902 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_54.txt @@ -0,0 +1,7 @@ +If we look just at this frame: +encoder.block.2.layer.1.layer_norm T5LayerNorm +8.69e-02 4.18e-01 weight +2.65e-04 3.42e+03 input[0] +1.79e-06 4.65e+00 output +Here, encoder.block.2.layer.1.layer_norm indicates that it was a layer norm for the first layer, of the second +block of the encoder. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_55.txt b/chunked/nltk_chunking/_debugging/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c5ca2ce9c239ae5c1eca1723a15fa3771a8a87c --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_55.txt @@ -0,0 +1 @@ +And the specific calls of the forward is T5LayerNorm. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_56.txt b/chunked/nltk_chunking/_debugging/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab978abab066bd337e95f878b15817db3c04273 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_56.txt @@ -0,0 +1,25 @@ +Let's look at the last few frames of that report: +Detected inf/nan during batch_number=0 +Last 21 forward frames: +abs min abs max metadata +[] + encoder.block.2.layer.1.DenseReluDense.wi_0 Linear +2.17e-07 4.50e+00 weight +1.79e-06 4.65e+00 input[0] +2.68e-06 3.70e+01 output + encoder.block.2.layer.1.DenseReluDense.wi_1 Linear +8.08e-07 2.66e+01 weight +1.79e-06 4.65e+00 input[0] +1.27e-04 2.37e+02 output + encoder.block.2.layer.1.DenseReluDense.wo Linear +1.01e-06 6.44e+00 weight +0.00e+00 9.74e+03 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense +1.79e-06 4.65e+00 input[0] +3.18e-04 6.27e+04 output + encoder.block.2.layer.1.dropout Dropout +3.18e-04 6.27e+04 input[0] +0.00e+00 inf output +The last frame reports for Dropout.forward function with the first entry for the only input and the second for the +only output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_57.txt b/chunked/nltk_chunking/_debugging/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a1afcbb511f83da0886cab05a519dd6f1b80fa3 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_57.txt @@ -0,0 +1 @@ +You can see that it was called from an attribute dropout inside DenseReluDense class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_58.txt b/chunked/nltk_chunking/_debugging/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f09f6eaeb255cb6b3ce1df61c3d7bb885c49889b --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_58.txt @@ -0,0 +1,2 @@ +We can see +that it happened during the first layer, of the 2nd block, during the very first batch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_59.txt b/chunked/nltk_chunking/_debugging/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e6091d0c705e6a3ca84739eadc0b74f50f1f881 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_59.txt @@ -0,0 +1,2 @@ +Finally, the absolute largest +input elements was 6.27e+04 and same for the output was inf. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_6.txt b/chunked/nltk_chunking/_debugging/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..673ab551753c2d7ebb28ec39ed7b6660dd791524 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_6.txt @@ -0,0 +1,2 @@ +Non-identical CUDA toolkits +PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_60.txt b/chunked/nltk_chunking/_debugging/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1246760d7c554f0cb6bf7d15e414d515be80fdd5 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_60.txt @@ -0,0 +1,2 @@ +You can see here that T5DenseGatedGeluDense.forward resulted in output activations, whose absolute max value was +around 62.7K, which is very close to fp16's top limit of 64K. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_61.txt b/chunked/nltk_chunking/_debugging/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..377be1a08a98788ef81843a517da95e635537be7 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_61.txt @@ -0,0 +1,3 @@ +In the next frame we have Dropout, which rescales +the remaining elements after zeroing some of them, which pushes the absolute max value to more than 64K, and we get an +overflow (inf). \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_62.txt b/chunked/nltk_chunking/_debugging/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..552083cae551aa26bd3b61256e50bb6f071c43c1 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_62.txt @@ -0,0 +1,2 @@ +As you can see, it's the previous frames that we need to look into when the numbers start getting too large for fp16 +numbers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_63.txt b/chunked/nltk_chunking/_debugging/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b0969e30874c91c1e3c4149f668cb3958af3918 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_63.txt @@ -0,0 +1,19 @@ +Let's match the report to the code from models/t5/modeling_t5.py: +python +class T5DenseGatedGeluDense(nn.Module): + def __init__(self, config): + super().__init__() + self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout_rate) + self.gelu_act = ACT2FN["gelu_new"] + def forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states + +Now it's easy to see the dropout call, and all the previous calls as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_64.txt b/chunked/nltk_chunking/_debugging/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd3ed07bfc369a83986bf0e13db1854e55ca3517 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_64.txt @@ -0,0 +1,2 @@ +Since the detection is happening in a forward hook, these reports are printed immediately after each forward +returns.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_65.txt b/chunked/nltk_chunking/_debugging/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..c58952804cd478ffa46d3420f398ce54a61c06c9 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_65.txt @@ -0,0 +1,3 @@ +Going back to the full report, to act on it and to fix the problem, we need to go a few frames up where the numbers +started to go up and most likely switch to the fp32 mode here, so that the numbers don't overflow when multiplied +or summed up. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_66.txt b/chunked/nltk_chunking/_debugging/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..934204e08859eea7050b70449f96c3d95a6af032 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_66.txt @@ -0,0 +1 @@ +Of course, there might be other solutions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_67.txt b/chunked/nltk_chunking/_debugging/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..737d2069d795ce2c2a748c98f76f5d356a5d7b99 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_67.txt @@ -0,0 +1,20 @@ +For example, we could turn off amp temporarily if it's +enabled, after moving the original forward into a helper wrapper, like so: +thon +def _forward(self, hidden_states): + hidden_gelu = self.gelu_act(self.wi_0(hidden_states)) + hidden_linear = self.wi_1(hidden_states) + hidden_states = hidden_gelu * hidden_linear + hidden_states = self.dropout(hidden_states) + hidden_states = self.wo(hidden_states) + return hidden_states +import torch +def forward(self, hidden_states): + if torch.is_autocast_enabled(): + with torch.cuda.amp.autocast(enabled=False): + return self._forward(hidden_states) + else: + return self._forward(hidden_states) + +Since the automatic detector only reports on inputs and outputs of full frames, once you know where to look, you may +want to analyse the intermediary stages of any specific forward function as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_68.txt b/chunked/nltk_chunking/_debugging/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..d160d5ff1f15ba3e34777d9476e5628d9ac42d72 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_68.txt @@ -0,0 +1,15 @@ +In such a case you can use the +detect_overflow helper function to inject the detector where you want it, for example: +thon +from debug_utils import detect_overflow +class T5LayerFF(nn.Module): + [] +def forward(self, hidden_states): + forwarded_states = self.layer_norm(hidden_states) + detect_overflow(forwarded_states, "after layer_norm") + forwarded_states = self.DenseReluDense(forwarded_states) + detect_overflow(forwarded_states, "after DenseReluDense") + return hidden_states + self.dropout(forwarded_states) + +You can see that we added 2 of these and now we track if inf or nan for forwarded_states was detected +somewhere in between. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_69.txt b/chunked/nltk_chunking/_debugging/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..403dce3f9caa34032ed7057a869273a682c87142 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_69.txt @@ -0,0 +1,2 @@ +Actually, the detector already reports these because each of the calls in the example above is a nn.Module, but +let's say if you had some local direct calculations this is how you'd do that. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_7.txt b/chunked/nltk_chunking/_debugging/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b20a2cb5e4c2a4c2a80909a0fb7261ea20079e2 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_7.txt @@ -0,0 +1 @@ +For example, if you installed PyTorch with cudatoolkit==10.2 in your Python environment, then you'll also need to have CUDA 10.2 installed system-wide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_70.txt b/chunked/nltk_chunking/_debugging/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..09e93f3435d622cc0cb1ca262eec0391cee8d142 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_70.txt @@ -0,0 +1,2 @@ +Additionally, if you're instantiating the debugger in your own code, you can adjust the number of frames printed from +its default, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_71.txt b/chunked/nltk_chunking/_debugging/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..114730fc7de6442496724627a3cff5f09fa3a022 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_71.txt @@ -0,0 +1,7 @@ +: +thon +from transformers.debug_utils import DebugUnderflowOverflow +debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100) + +Specific batch absolute min and max value tracing +The same debugging class can be used for per-batch tracing with the underflow/overflow detection feature turned off. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_72.txt b/chunked/nltk_chunking/_debugging/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6de98ff64e1221c58d95607cb15b38bb8f6e471 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_72.txt @@ -0,0 +1,2 @@ +Let's say you want to watch the absolute min and max values for all the ingredients of each forward call of a given +batch, and only do that for batches 1 and 3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_73.txt b/chunked/nltk_chunking/_debugging/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..35593fee296d03f9a3604e1cc42447b66c724b67 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_73.txt @@ -0,0 +1,4 @@ +Then you instantiate this class as: +python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3]) +And now full batches 1 and 3 will be traced using the same format as the underflow/overflow detector does. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_74.txt b/chunked/nltk_chunking/_debugging/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..e86ff0b3d2a2fe5e0bc420d3ec3e973c9dcc3bd4 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_74.txt @@ -0,0 +1 @@ +Batches are 0-indexed. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_75.txt b/chunked/nltk_chunking/_debugging/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..409f5c82dc9402c646f47dd5eb9bf2b54b4b4393 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_75.txt @@ -0,0 +1,2 @@ +This is helpful if you know that the program starts misbehaving after a certain batch number, so you can fast-forward +right to that area. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_76.txt b/chunked/nltk_chunking/_debugging/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..a89e44e6e9339e5acaec68b6b32e3934c78082d3 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_76.txt @@ -0,0 +1,31 @@ +Here is a sample truncated output for such configuration: + + *** Starting batch number=1 *** +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.47e+04 input[0] +5.36e-05 7.92e+02 output +[] + decoder.dropout Dropout +1.60e-07 2.27e+01 input[0] +0.00e+00 2.52e+01 output + decoder T5Stack + not a tensor output + lm_head Linear +1.01e-06 7.92e+02 weight +0.00e+00 1.11e+00 input[0] +6.06e-02 8.39e+01 output + T5ForConditionalGeneration + not a tensor output + *** Starting batch number=3 *** + +abs min abs max metadata + shared Embedding +1.01e-06 7.92e+02 weight +0.00e+00 2.78e+04 input[0] +5.36e-05 7.92e+02 output +[] + +Here you will get a huge number of frames dumped - as many as there were forward calls in your model, so it may or may +not what you want, but sometimes it can be easier to use for debugging purposes than a normal debugger. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_77.txt b/chunked/nltk_chunking/_debugging/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c04b91ee4ed793a13a2a6d3221652dd91e3a8ac --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_77.txt @@ -0,0 +1,2 @@ +For example, if +a problem starts happening at batch number 150. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_78.txt b/chunked/nltk_chunking/_debugging/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..67344d79b08f6d379d0d55fbf65be89833fc5c6b --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_78.txt @@ -0,0 +1,2 @@ +So you can dump traces for batches 149 and 150 and compare where +numbers started to diverge. \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_79.txt b/chunked/nltk_chunking/_debugging/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1def4733b6f1be695ecb2e8d9f90488c3fdd9b7 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_79.txt @@ -0,0 +1,3 @@ +You can also specify the batch number after which to stop the training, with: +python +debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3) \ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_8.txt b/chunked/nltk_chunking/_debugging/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8c3d883b77ef013a5095112ff6b167c9e8cc20c --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_8.txt @@ -0,0 +1 @@ +If you don't have CUDA installed system-wide, you should install it first. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_debugging/chunk_9.txt b/chunked/nltk_chunking/_debugging/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..06e56b8aafe75c05101c8e845a622e9f1ddefaf0 --- /dev/null +++ b/chunked/nltk_chunking/_debugging/chunk_9.txt @@ -0,0 +1 @@ +The exact location may vary from system to system, but /usr/local/cuda-10.2 is the most common location on many Unix systems. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_0.txt b/chunked/nltk_chunking/_deepspeed/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c442a6e36794e4524ab78bb00754b6d03818775 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_0.txt @@ -0,0 +1,3 @@ + +DeepSpeed +DeepSpeed is a PyTorch optimization library that makes distributed training memory-efficient and fast. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_1.txt b/chunked/nltk_chunking/_deepspeed/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d6ceb801b9659aee1ee6802a9c640ffa3e11019 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_1.txt @@ -0,0 +1 @@ +At its core is the Zero Redundancy Optimizer (ZeRO) which enables training large models at scale. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_10.txt b/chunked/nltk_chunking/_deepspeed/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bc83d4af36391d708275fe3733119c84486e195 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_10.txt @@ -0,0 +1,6 @@ +pip install deepspeed + +pip install transformers[deepspeed] + +Memory requirements +Before you begin, it is a good idea to check whether you have enough GPU and CPU memory to fit your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_100.txt b/chunked/nltk_chunking/_deepspeed/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd066afd1ac7b765ce7a7ffe5c82d320620af36c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_100.txt @@ -0,0 +1,3 @@ +To enable this feature: + +For a Hugging Face model, set model.gradient_checkpointing_enable() or --gradient_checkpointing in the [Trainer]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_101.txt b/chunked/nltk_chunking/_deepspeed/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..995e677bb45b22fadd0eda2cadf7a331ca365c4e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_101.txt @@ -0,0 +1 @@ +For a non-Hugging Face model, use the DeepSpeed Activation Checkpointing API. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_102.txt b/chunked/nltk_chunking/_deepspeed/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..859194207c2fdeb7146c234f3d6d01ea564cddaf --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_102.txt @@ -0,0 +1 @@ +You could also modify the Transformers modeling code directly and replace torch.utils.checkpoint with the DeepSpeed API.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_103.txt b/chunked/nltk_chunking/_deepspeed/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..7aab68dbbfbe3a6c7a6585d556d1a9e30e7c5448 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_103.txt @@ -0,0 +1 @@ +This approach is more flexible because you can offload the forward activations to the CPU memory instead of recalculating them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_104.txt b/chunked/nltk_chunking/_deepspeed/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..925c498422515f0a10b2baefad73c899cc40029b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_104.txt @@ -0,0 +1,2 @@ +Optimizer and scheduler +DeepSpeed and Transformers optimizer and scheduler can be mixed and matched as long as you don't enable offload_optimizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_105.txt b/chunked/nltk_chunking/_deepspeed/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b217481f8b78e259918b1e09d8c4662132a305 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_105.txt @@ -0,0 +1 @@ +When offload_optimizer is enabled, you could use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_106.txt b/chunked/nltk_chunking/_deepspeed/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..6768211b0932f6c9628796f90eafac615dc413cf --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_106.txt @@ -0,0 +1 @@ +The optimizer and scheduler parameters for the config file can be set from the command line to avoid hard to find errors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_107.txt b/chunked/nltk_chunking/_deepspeed/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ba7673bc6a92e96e4b920beea2a8d614230dc6b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_107.txt @@ -0,0 +1 @@ +For example, if the learning rate is set to a different value in another place you can override it from the command line. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_108.txt b/chunked/nltk_chunking/_deepspeed/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..319db7e93ad59888a79a328918c6e650b16e9a12 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_108.txt @@ -0,0 +1 @@ +Aside from the optimizer and scheduler parameters, you'll need to ensure your [Trainer] command line arguments match the DeepSpeed configuration. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_109.txt b/chunked/nltk_chunking/_deepspeed/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..2802f420df07c241259cb33895fa102a3d06afe1 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_109.txt @@ -0,0 +1 @@ +DeepSpeed offers several optimizers (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. 
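As a small illustration of the two routes described above, here is a hedged sketch. It uses the google-t5/t5-small checkpoint that also appears later in this guide; the output_dir value is a placeholder.

```python
from transformers import AutoModelForSeq2SeqLM, TrainingArguments

# Sketch only: model name and output_dir are illustrative.
model = AutoModelForSeq2SeqLM.from_pretrained("google-t5/t5-small")

# Route 1: enable activation/gradient checkpointing directly on the model.
model.gradient_checkpointing_enable()

# Route 2: let the Trainer enable it, which is the programmatic equivalent of
# passing --gradient_checkpointing on the command line.
training_args = TrainingArguments(
    output_dir="output_dir",
    gradient_checkpointing=True,
)
```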
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_11.txt b/chunked/nltk_chunking/_deepspeed/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..e214876f07775230585222b471ef311c65696523 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_11.txt @@ -0,0 +1 @@ +DeepSpeed provides a tool for estimating the required CPU/GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_110.txt b/chunked/nltk_chunking/_deepspeed/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..3220850784a489cc6b029a65dd84b5a7b0c0cb86 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_110.txt @@ -0,0 +1 @@ +If you don't configure the optimizer in the config, the [Trainer] automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: lr, adam_beta1, adam_beta2, adam_epsilon, weight_decay. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_111.txt b/chunked/nltk_chunking/_deepspeed/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..086893c14dcbdd58458950634bdbdb1f3d81c9da --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_111.txt @@ -0,0 +1 @@ +You can set the parameters to "auto" or manually input your own desired values. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_112.txt b/chunked/nltk_chunking/_deepspeed/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..7bba430de8144803f61de298d41c25bee1894193 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_112.txt @@ -0,0 +1,13 @@ +yaml +{ + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } + } +} +You can also use an unsupported optimizer by adding the following to the top-level configuration. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_113.txt b/chunked/nltk_chunking/_deepspeed/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f9d53668a07567279e2d1c68014251857d5cdbd --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_113.txt @@ -0,0 +1,5 @@ +yaml +{ + "zero_allow_untested_optimizer": true +} +From DeepSpeed==0.8.3 on, if you want to use offload, you'll also need to add the following to the top-level configuration because offload works best with DeepSpeed's CPU Adam optimizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_114.txt b/chunked/nltk_chunking/_deepspeed/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..98947116029f97f100beec3e121c75ec23bc3ae1 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_114.txt @@ -0,0 +1,6 @@ +yaml +{ + "zero_force_ds_cpu_optimizer": false +} + +DeepSpeed supports the LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR learning rate schedulers.
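For reference, the same optimizer block can be written as a Python dict. To the best of my understanding, [TrainingArguments] accepts either a dict or a file path for its deepspeed argument, and the "auto" fields are then resolved from the matching Trainer arguments; all concrete values below are illustrative.

```python
from transformers import TrainingArguments

# Sketch only: "auto" entries are filled in by the Trainer from the matching
# arguments (learning rate, Adam betas/epsilon, weight decay).
ds_config = {
    "optimizer": {
        "type": "AdamW",
        "params": {
            "lr": "auto",
            "betas": "auto",
            "eps": "auto",
            "weight_decay": "auto",
        },
    },
    # Only needed if you plug in an optimizer DeepSpeed hasn't tested.
    "zero_allow_untested_optimizer": True,
}

training_args = TrainingArguments(
    output_dir="output_dir",   # placeholder
    learning_rate=3e-5,        # illustrative; feeds the "auto" lr
    weight_decay=0.01,         # illustrative; feeds the "auto" weight_decay
    deepspeed=ds_config,       # a path to a JSON file works here as well
)
```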
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_115.txt b/chunked/nltk_chunking/_deepspeed/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..055f9385d5f59c2c228936bf86f1082d2642be7b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_115.txt @@ -0,0 +1,6 @@ +Transformers and DeepSpeed provide two of the same schedulers: + +WarmupLR is the same as --lr_scheduler_type constant_with_warmup in Transformers +WarmupDecayLR is the same as --lr_scheduler_type linear in Transformers (this is the default scheduler used in Transformers) + +If you don't configure the scheduler in the config, the [Trainer] automatically selects WarmupDecayLR and either uses the supplied values or the default values for the following parameters from the command line: warmup_min_lr, warmup_max_lr, warmup_num_steps, total_num_steps (automatically calculated during run time if max_steps is not provided). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_116.txt b/chunked/nltk_chunking/_deepspeed/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..086893c14dcbdd58458950634bdbdb1f3d81c9da --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_116.txt @@ -0,0 +1 @@ +You can set the parameters to "auto" or manually input your own desired values. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_117.txt b/chunked/nltk_chunking/_deepspeed/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bc08acf6880006550a6aaf18824d7db657bb23a --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_117.txt @@ -0,0 +1,15 @@ +yaml +{ + "scheduler": { + "type": "WarmupDecayLR", + "params": { + "total_num_steps": "auto", + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } + } +} + +Precision +Deepspeed supports fp32, fp16, and bf16 mixed precision. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_118.txt b/chunked/nltk_chunking/_deepspeed/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..7afc3facf6187ba6340c9954c24900c72cb8240c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_118.txt @@ -0,0 +1 @@ +If your model doesn't work well with mixed precision, for example if it wasn't pretrained in mixed precision, you may encounter overflow or underflow issues which can cause NaN loss. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_119.txt b/chunked/nltk_chunking/_deepspeed/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b939bf5f5e05dd4f5dd97a31753a8a9ef5d40ec --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_119.txt @@ -0,0 +1 @@ +For these cases, you should use full fp32 precision by explicitly disabling the default fp16 mode. 
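A short hedged sketch of how the scheduler mapping above lines up in practice; the warmup value and output_dir are illustrative placeholders.

```python
from transformers import TrainingArguments

# Sketch only: WarmupDecayLR corresponds to --lr_scheduler_type linear; the
# "auto" entries are resolved from the warmup/step settings at run time.
ds_config = {
    "scheduler": {
        "type": "WarmupDecayLR",
        "params": {
            "total_num_steps": "auto",
            "warmup_min_lr": "auto",
            "warmup_max_lr": "auto",
            "warmup_num_steps": "auto",
        },
    },
}

training_args = TrainingArguments(
    output_dir="output_dir",
    lr_scheduler_type="linear",  # matches WarmupDecayLR
    warmup_steps=500,            # illustrative; feeds warmup_num_steps
    deepspeed=ds_config,
)
```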
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_12.txt b/chunked/nltk_chunking/_deepspeed/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7b766349e833d780634a0f65d968357978cb041 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_12.txt @@ -0,0 +1,9 @@ +For example, to estimate the memory requirements for the bigscience/T0_3B model on a single GPU: + +$ python -c 'from transformers import AutoModel; \ +from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \ +model = AutoModel.from_pretrained("bigscience/T0_3B"); \ +estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)' +[] +Estimated memory needed for params, optim states and gradients for a: +HW: Setup with 1 node, 1 GPU per node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_120.txt b/chunked/nltk_chunking/_deepspeed/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a10ca23dabf926a21deacdcc1cd009707c97708 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_120.txt @@ -0,0 +1,7 @@ +yaml +{ + "fp16": { + "enabled": false + } +} +For Ampere GPUs and PyTorch > 1.7, it automatically switches to the more efficient tf32 format for some operations but the results are still in fp32. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_121.txt b/chunked/nltk_chunking/_deepspeed/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..77dc85a35b2e99816cd7099e1018af4268ed4354 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_121.txt @@ -0,0 +1 @@ +You can control it from the [Trainer] by setting --tf32 to enable it, and --tf32 0 or --no_tf32 to disable it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_122.txt b/chunked/nltk_chunking/_deepspeed/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..845b53006f266209b8d3b1fd1c46239ea7297474 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_122.txt @@ -0,0 +1 @@ +Configuring PyTorch AMP-like fp16 mixed precision reduces memory usage and accelerates training speed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_123.txt b/chunked/nltk_chunking/_deepspeed/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..d41b8c5eb903623dc195578591e1fd6264f4a52e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_123.txt @@ -0,0 +1 @@ +[Trainer] automatically enables or disables fp16 based on the value of args.fp16_backend, and the rest of the config can be set by you. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_124.txt b/chunked/nltk_chunking/_deepspeed/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..21e9a1b99989a49511d1f8df38cd2c76f77c06fa --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_124.txt @@ -0,0 +1 @@ +fp16 is enabled from the command line when the following arguments are passed: --fp16, --fp16_backend amp or --fp16_full_eval.
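As a hedged sketch of the precision options discussed above, full fp32 and tf32 can be expressed on the Python side like this; output_dir and the config dict are illustrative.

```python
from transformers import TrainingArguments

# Sketch only: full fp32 training, with the default fp16 mode explicitly
# disabled in the DeepSpeed config.
ds_config_fp32 = {"fp16": {"enabled": False}}
args_fp32 = TrainingArguments(output_dir="output_dir", deepspeed=ds_config_fp32)

# On Ampere GPUs, tf32 can be toggled independently of fp16
# (the programmatic equivalent of --tf32 / --no_tf32).
args_tf32 = TrainingArguments(
    output_dir="output_dir",
    tf32=True,
    deepspeed=ds_config_fp32,
)
```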
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_125.txt b/chunked/nltk_chunking/_deepspeed/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..bde5d8ef409331cf4a7315c2ef7645227d584636 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_125.txt @@ -0,0 +1,12 @@ +yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +For additional DeepSpeed fp16 training options, take a look at the FP16 Training Options reference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_126.txt b/chunked/nltk_chunking/_deepspeed/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..835f4eb830f43f056662ac8f064891e79cec1e74 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_126.txt @@ -0,0 +1 @@ +To configure Apex-like fp16 mixed precision, setup the config as shown below with "auto" or your own values. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_127.txt b/chunked/nltk_chunking/_deepspeed/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d10bec138666df2623f13e3889e2df91b02f238 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_127.txt @@ -0,0 +1 @@ +[Trainer] automatically configure amp based on the values of args.fp16_backend and args.fp16_opt_level. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_128.txt b/chunked/nltk_chunking/_deepspeed/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..216558275295fc0723fca0c1358539eb13129428 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_128.txt @@ -0,0 +1,10 @@ +It can also be enabled from the command line when the following arguments are passed: --fp16, --fp16_backend apex or --fp16_opt_level 01. +yaml +{ + "amp": { + "enabled": "auto", + "opt_level": "auto" + } +} + +To use bf16, you'll need at least DeepSpeed==0.6.0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_129.txt b/chunked/nltk_chunking/_deepspeed/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc26f17c533469df2d367f0ed390e1f72f0eb2f5 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_129.txt @@ -0,0 +1 @@ +bf16 has the same dynamic range as fp32 and doesn’t require loss scaling. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_13.txt b/chunked/nltk_chunking/_deepspeed/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c3a81fd0ff6348fcd67f20966e1d912181c28384 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_13.txt @@ -0,0 +1 @@ +SW: Model with 2783M total params, 65M largest layer params. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_130.txt b/chunked/nltk_chunking/_deepspeed/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..dafe0cd2de214997e7a87fffc8b2eeff503d56ae --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_130.txt @@ -0,0 +1 @@ +However, if you use gradient accumulation with bf16, gradients are accumulated in bf16 which may not be desired because this format's low precision can lead to lossy accumulation. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_131.txt b/chunked/nltk_chunking/_deepspeed/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0df203aac9e3316d2a53a68274aa2b8f4c8fabf9 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_131.txt @@ -0,0 +1 @@ +bf16 can be setup in the config file or enabled from the command line when the following arguments are passed: --bf16 or --bf16_full_eval. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_132.txt b/chunked/nltk_chunking/_deepspeed/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..6853bbe7248af723a86b5d7a1ba47a794373ac74 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_132.txt @@ -0,0 +1,9 @@ +yaml +{ + "bf16": { + "enabled": "auto" + } +} + +Batch size +The batch size can be auto-configured or explicitly set. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_133.txt b/chunked/nltk_chunking/_deepspeed/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..8fdba0118a88369f63cdd15a35866c7f954d3ef6 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_133.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets train_micro_batch_size_per_gpu to the value of args.per_device_train_batch_size and train_batch_size to args.world_size * args.per_device_train_batch_size * args.gradient_accumulation_steps. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_134.txt b/chunked/nltk_chunking/_deepspeed/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a90a6d659eb0d5a23c195771067ca46bd5c5438 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_134.txt @@ -0,0 +1,7 @@ +yaml +{ + "train_micro_batch_size_per_gpu": "auto", + "train_batch_size": "auto" +} +Gradient accumulation +Gradient accumulation can be auto-configured or explicitly set. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_135.txt b/chunked/nltk_chunking/_deepspeed/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..05323339dae0ac7687ac11712bae77b072893c3e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_135.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets it to the value of args.gradient_accumulation_steps. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_136.txt b/chunked/nltk_chunking/_deepspeed/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5d9d5bf7269c1c66fcbcb09bd2242f71fee3326 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_136.txt @@ -0,0 +1,7 @@ +```yaml +{ + "gradient_accumulation_steps": "auto" +} + +Gradient clipping +Gradient clipping can be auto-configured or explicitly set. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_137.txt b/chunked/nltk_chunking/_deepspeed/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..945962cf02e7c6890d872fb9be23f39334a1d341 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_137.txt @@ -0,0 +1 @@ +If you choose to use the "auto" option, [Trainer] sets it to the value of args.max_grad_norm. 
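Here is a tiny worked example of how the "auto" batch-size values resolve according to the rule quoted above; all numbers are made up for illustration.

```python
# Sketch only: how the Trainer resolves the "auto" batch-size fields.
world_size = 2                    # number of GPUs/processes
per_device_train_batch_size = 8   # args.per_device_train_batch_size
gradient_accumulation_steps = 4   # args.gradient_accumulation_steps

train_micro_batch_size_per_gpu = per_device_train_batch_size  # -> 8
train_batch_size = (
    world_size * per_device_train_batch_size * gradient_accumulation_steps
)                                                              # -> 64

print(train_micro_batch_size_per_gpu, train_batch_size)
```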
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_138.txt b/chunked/nltk_chunking/_deepspeed/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb2411657ee8dd35c20224e5855c76e2f1972697 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_138.txt @@ -0,0 +1,6 @@ +yaml +{ + "gradient_clipping": "auto" +} +Communication data type +For communication collectives like reduction, gathering and scattering operations, a separate data type is used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_139.txt b/chunked/nltk_chunking/_deepspeed/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..da405ed0f2934113617e9c4e8c0423e7395d0661 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_139.txt @@ -0,0 +1 @@ +All gather and scatter operations are performed in the same data type the data is in. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_14.txt b/chunked/nltk_chunking/_deepspeed/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c52880f0c2709f490d735d4b4ef6ced642c3018 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_14.txt @@ -0,0 +1,8 @@ +per CPU | per GPU | Options + 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1 + 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0 + 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=1 + 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=0 + 0.37GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=1 + 15.56GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=0 +This means you either need a single 80GB GPU without CPU offload or a 8GB GPU and a ~60GB CPU to offload to (these are just the memory requirements for the parameters, optimizer states and gradients, and you'll need a bit more for the CUDA kernels and activations). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_140.txt b/chunked/nltk_chunking/_deepspeed/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..8bfed7876df8b4d0fb81cd6e72bfa6d03448c7fa --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_140.txt @@ -0,0 +1 @@ +For example, if you're training with bf16, the data is also gathered in bf16 because gathering is a non-lossy operation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_141.txt b/chunked/nltk_chunking/_deepspeed/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e74ac9836128c3b96070afcf93d9d0307f26b0e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_141.txt @@ -0,0 +1 @@ +Reduce operations are lossy, for example when gradients are averaged across multiple GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_142.txt b/chunked/nltk_chunking/_deepspeed/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..497a0514c381b5531bbdc84ea979b45f28ed841b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_142.txt @@ -0,0 +1 @@ +When the communication is done in fp16 or bf16, it is more likely to be lossy because adding multiple numbers in low precision isn't exact. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_143.txt b/chunked/nltk_chunking/_deepspeed/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..386810deee25b18c57554b9ac286e9ff1f57de74 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_143.txt @@ -0,0 +1 @@ +This is especially the case with bf16 which has a lower precision than fp16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_144.txt b/chunked/nltk_chunking/_deepspeed/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..262042d4bf4592b55d87ebfbc8f2a66ff7d7eca9 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_144.txt @@ -0,0 +1 @@ +For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_145.txt b/chunked/nltk_chunking/_deepspeed/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..05c8f8dcb484a677ab0274df3dc24730f432cca3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_145.txt @@ -0,0 +1 @@ +You can choose the communication data type by setting the communication_data_type parameter in the config file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_146.txt b/chunked/nltk_chunking/_deepspeed/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..55f4470e94c6e8cce5dcdd06cb40ce1049197128 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_146.txt @@ -0,0 +1 @@ +For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it is downcasted to whichever half-precision dtype you're training in. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_147.txt b/chunked/nltk_chunking/_deepspeed/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad428106230be7a998fb85802fa6f853a959dc75 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_147.txt @@ -0,0 +1,6 @@ +yaml +{ + "communication_data_type": "fp32" +} +Deployment +DeepSpeed can be deployed by different launchers such as torchrun, the deepspeed launcher, or Accelerate. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_148.txt b/chunked/nltk_chunking/_deepspeed/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f279ac48604cd3782a4dd8c1cb161a2874c25cb --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_148.txt @@ -0,0 +1 @@ +To deploy, add --deepspeed ds_config.json to the [Trainer] command line. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_149.txt b/chunked/nltk_chunking/_deepspeed/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..402e9c66b46bd4375dfbe3c766d34ae56fb15400 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_149.txt @@ -0,0 +1 @@ +It’s recommended to use DeepSpeed’s add_config_arguments utility to add any necessary command line arguments to your code. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_15.txt b/chunked/nltk_chunking/_deepspeed/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..60659bde3a1fdfd61c3dbbb6d17b4a7d6806a4fc --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_15.txt @@ -0,0 +1 @@ +You should also consider the tradeoff between cost and speed because it'll be cheaper to rent or buy a smaller GPU but it'll take longer to train your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_150.txt b/chunked/nltk_chunking/_deepspeed/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..765871608f0ecfaeb14b2b129e8d3c83485356b7 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_150.txt @@ -0,0 +1 @@ +This guide will show you how to deploy DeepSpeed with the deepspeed launcher for different training setups. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_151.txt b/chunked/nltk_chunking/_deepspeed/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f33e96ded28f3b527317051fe5b938ce8944f4c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_151.txt @@ -0,0 +1 @@ +You can check out this post for more practical usage examples. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_152.txt b/chunked/nltk_chunking/_deepspeed/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..22d848dc8f9aab9c076500f4b3bda7805ec2410e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_152.txt @@ -0,0 +1 @@ +To deploy DeepSpeed on multiple GPUs, add the --num_gpus parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_153.txt b/chunked/nltk_chunking/_deepspeed/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..d51ba104b7a5278e6ca511b963ca4f4b1b6d6e64 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_153.txt @@ -0,0 +1 @@ +If you want to use all available GPUs, you don't need to add --num_gpus. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_154.txt b/chunked/nltk_chunking/_deepspeed/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad6beea89aa03967ae89050edd26f12819c2d733 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_154.txt @@ -0,0 +1 @@ +The example below uses 2 GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_155.txt b/chunked/nltk_chunking/_deepspeed/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a61f9f715247f588d8a7267c05d3b7c47039f14 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_155.txt @@ -0,0 +1,9 @@ +deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero3.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro + +To deploy DeepSpeed on a single GPU, add the --num_gpus parameter. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_156.txt b/chunked/nltk_chunking/_deepspeed/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1f13c10d8f8ddebe410171fbe7c828bd880804d --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_156.txt @@ -0,0 +1 @@ +It isn't necessary to explicitly set this value if you only have 1 GPU because DeepSpeed deploys all GPUs it can see on a given node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_157.txt b/chunked/nltk_chunking/_deepspeed/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..840d98150b1431bd5deceb4885243fcc0e44695e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_157.txt @@ -0,0 +1,10 @@ +deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \ +--deepspeed tests/deepspeed/ds_config_zero2.json \ +--model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \ +--output_dir output_dir --overwrite_output_dir --fp16 \ +--do_train --max_train_samples 500 --num_train_epochs 1 \ +--dataset_name wmt16 --dataset_config "ro-en" \ +--source_lang en --target_lang ro +DeepSpeed is still useful with just 1 GPU because you can: + +Offload some computations and memory to the CPU to make more GPU resources available to your model to use a larger batch size or fit a very large model that normally won't fit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_158.txt b/chunked/nltk_chunking/_deepspeed/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c4719a5bff908aa1a02cc2cc4b88f7c5e14c0df --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_158.txt @@ -0,0 +1 @@ +Minimize memory fragmentation with it's smart GPU memory management system which also allows you to fit bigger models and data batches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_159.txt b/chunked/nltk_chunking/_deepspeed/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..946eb4478b0962236d9759c55c187fe917f7b9be --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_159.txt @@ -0,0 +1 @@ +Set the allgather_bucket_size and reduce_bucket_size values to 2e8 in the ZeRO-2 configuration file to get better performance on a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_16.txt b/chunked/nltk_chunking/_deepspeed/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..01a29dfa39b2429553e28f6c027f8fd65bc15b9c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_16.txt @@ -0,0 +1 @@ +If you have enough GPU memory make sure you disable CPU/NVMe offload to make everything faster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_160.txt b/chunked/nltk_chunking/_deepspeed/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..ddff58a357cfa4b0fc93e6ca76c5c7d83495354e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_160.txt @@ -0,0 +1,2 @@ +Multi-node deployment +A node is one or more GPUs for running a workload. 
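Putting the single-GPU advice above in one place, here is a hedged sketch of a ZeRO-2 block with CPU optimizer offload and the suggested 2e8 bucket sizes; any option not mentioned in the text is simply left at its default.

```python
# Sketch only: a minimal ZeRO-2 configuration dict tuned for a single GPU as
# described above. It can be passed to TrainingArguments(deepspeed=...) or
# dumped to JSON for the deepspeed launcher.
ds_config_zero2_single_gpu = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "allgather_bucket_size": 2e8,
        "reduce_bucket_size": 2e8,
    },
    "train_micro_batch_size_per_gpu": "auto",
    "train_batch_size": "auto",
}
```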
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_161.txt b/chunked/nltk_chunking/_deepspeed/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..e14986e787881982d8737ef1f281c8c14f11fe7d --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_161.txt @@ -0,0 +1 @@ +A more powerful setup is a multi-node setup which can be launched with the deepspeed launcher. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_162.txt b/chunked/nltk_chunking/_deepspeed/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9858e925971e286f831b8aa244c27d1c6f3c6ac --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_162.txt @@ -0,0 +1 @@ +For this guide, let's assume there are two nodes with 8 GPUs each. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_163.txt b/chunked/nltk_chunking/_deepspeed/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf69dddd1f1ef034fc1b71da136226089491bfb --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_163.txt @@ -0,0 +1 @@ +The first node can be accessed ssh hostname1 and the second node with ssh hostname2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_164.txt b/chunked/nltk_chunking/_deepspeed/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb6d50eadd90633779fd1e10d7c27265af7e01b3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_164.txt @@ -0,0 +1 @@ +Both nodes must be able to communicate with each other locally over ssh without a password. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_165.txt b/chunked/nltk_chunking/_deepspeed/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6b0a97b0df0ee0e1066d049743c9ee13a717ebc --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_165.txt @@ -0,0 +1 @@ +By default, DeepSpeed expects your multi-node environment to use a shared storage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_166.txt b/chunked/nltk_chunking/_deepspeed/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb5f7d68fda5d64b42c9e817e4dfb6273228eb77 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_166.txt @@ -0,0 +1,8 @@ +If this is not the case and each node can only see the local filesystem, you need to adjust the config file to include a checkpoint to allow loading without access to a shared filesystem: +yaml +{ + "checkpoint": { + "use_node_local_storage": true + } +} +You could also use the [Trainer]'s --save_on_each_node argument to automatically add the above checkpoint to your config. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_167.txt b/chunked/nltk_chunking/_deepspeed/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..900d8af2412a2f199228dd6cc998b7ac3b704538 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_167.txt @@ -0,0 +1 @@ +For torchrun, you have to ssh to each node and run the following command on both of them. 
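A minimal sketch of the two equivalent ways to handle node-local storage described above; the output_dir is a placeholder and the values are illustrative.

```python
from transformers import TrainingArguments

# Sketch only. Option 1: put the checkpoint block in the DeepSpeed config yourself.
ds_config = {
    "checkpoint": {"use_node_local_storage": True},
}

# Option 2: let the Trainer add the same block for you, the programmatic
# equivalent of passing --save_on_each_node on the command line.
training_args = TrainingArguments(
    output_dir="output_dir",
    save_on_each_node=True,
    deepspeed=ds_config,
)
```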
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_168.txt b/chunked/nltk_chunking/_deepspeed/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..adf2f8e4025ddda62c90abf3e7579c01fc43dffd --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_168.txt @@ -0,0 +1 @@ +The launcher waits until both nodes are synchronized before launching the training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_169.txt b/chunked/nltk_chunking/_deepspeed/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e66677e60dbce0e2384fe52e271f34f1a5c3cbc --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_169.txt @@ -0,0 +1,4 @@ +python -m torch.distributed.run --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=hostname1 \ +--master_port=9901 your_program.py --deepspeed ds_config.json + +For the deepspeed launcher, start by creating a hostfile. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_17.txt b/chunked/nltk_chunking/_deepspeed/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c02144beddb127cfd62dfb04490234dd23c5ae9 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_17.txt @@ -0,0 +1,2 @@ +Select a ZeRO stage +After you've installed DeepSpeed and have a better idea of your memory requirements, the next step is selecting a ZeRO stage to use. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_170.txt b/chunked/nltk_chunking/_deepspeed/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..49d0114626bf1002577d410ad00ea17f4318065a --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_170.txt @@ -0,0 +1,3 @@ +hostname1 slots=8 +hostname2 slots=8 +Then you can launch the training with the following command. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_171.txt b/chunked/nltk_chunking/_deepspeed/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..f37dbdf4ba397d21adb9ce613e2a659d232849ac --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_171.txt @@ -0,0 +1 @@ +The deepspeed launcher automatically launches the command on both nodes at once. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_172.txt b/chunked/nltk_chunking/_deepspeed/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..347ef6cd37217f7fd43d02adebb2065a257340a1 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_172.txt @@ -0,0 +1,3 @@ +deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \ +your_program.py --deepspeed ds_config.json +Check out the Resource Configuration (multi-node) guide for more details about configuring multi-node compute resources. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_173.txt b/chunked/nltk_chunking/_deepspeed/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..229089cf5dfa9b25e0b12d818dc2c084fe835230 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_173.txt @@ -0,0 +1,2 @@ +SLURM +In a SLURM environment, you'll need to adapt your SLURM script to your specific SLURM environment.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_174.txt b/chunked/nltk_chunking/_deepspeed/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac72e73fc6000b27498decffd6a37e518191580e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_174.txt @@ -0,0 +1,5 @@ +An example SLURM script may look like: +```bash +SBATCH --job-name=test-nodes # name +SBATCH --nodes=2 # nodes +SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node! \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_175.txt b/chunked/nltk_chunking/_deepspeed/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd5d3987df92e1309049cdabc78c0a6edbe8a636 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_175.txt @@ -0,0 +1,13 @@ +SBATCH --cpus-per-task=10 # number of cores per tasks +SBATCH --gres=gpu:8 # number of gpus +SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS) +SBATCH --output=%x-%j.out # output file name +export GPUS_PER_NODE=8 +export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1) +export MASTER_PORT=9901 +srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \ + --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \ + --master_addr $MASTER_ADDR --master_port $MASTER_PORT \ +your_program.py --deepspeed ds_config.json' + +Then you can schedule your multi-node deployment with the following command which launches training simultaneously on all nodes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_176.txt b/chunked/nltk_chunking/_deepspeed/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..d87d2a53cb65ecd970caa1a41aa156eecaef2e65 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_176.txt @@ -0,0 +1,3 @@ +sbatch launch.slurm +Notebook +The deepspeed launcher doesn't support deployment from a notebook so you'll need to emulate the distributed environment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_177.txt b/chunked/nltk_chunking/_deepspeed/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..68aa56f8bf2a8ce0bef8a96141b873e5f3d9165e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_177.txt @@ -0,0 +1 @@ +However, this only works for 1 GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_178.txt b/chunked/nltk_chunking/_deepspeed/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb99d480e7d223e40a6707e216c33c653773500e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_178.txt @@ -0,0 +1 @@ +If you want to use more than 1 GPU, you must use a multi-process environment for DeepSpeed to work. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_179.txt b/chunked/nltk_chunking/_deepspeed/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..56b453d4ef8fed1be236376fe80c8f93dabab5c5 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_179.txt @@ -0,0 +1 @@ +This means you have to use the deepspeed launcher which can't be emulated as shown here. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_18.txt b/chunked/nltk_chunking/_deepspeed/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4259f8a79d02b6095124caf2ed2788cfa4e7674 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_18.txt @@ -0,0 +1,9 @@ +In order of fastest and most memory-efficient: +| Fastest | Memory efficient | +|------------------|------------------| +| ZeRO-1 | ZeRO-3 + offload | +| ZeRO-2 | ZeRO-3 | +| ZeRO-2 + offload | ZeRO-2 + offload | +| ZeRO-3 | ZeRO-2 | +| ZeRO-3 + offload | ZeRO-1 | +To find what works best for you, start with the fastest approach and if you run out of memory, try the next stage which is slower but more memory efficient. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_180.txt b/chunked/nltk_chunking/_deepspeed/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a9c8764b75f7e4d08b92bb76ab3c3c10a584758 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_180.txt @@ -0,0 +1 @@ +DeepSpeed requires a distributed environment even when only one process is used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_181.txt b/chunked/nltk_chunking/_deepspeed/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..08071098fd3c6ab6445849e342b5584574f6ad93 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_181.txt @@ -0,0 +1,13 @@ +This emulates a launcher in the notebook +import os +os.environ["MASTER_ADDR"] = "localhost" +os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use +os.environ["RANK"] = "0" +os.environ["LOCAL_RANK"] = "0" +os.environ["WORLD_SIZE"] = "1" +Now proceed as normal, plus pass the DeepSpeed config file +training_args = TrainingArguments(, deepspeed="ds_config_zero3.json") +trainer = Trainer() +trainer.train() + +If you want to create the config file on the fly in the notebook in the current directory, you could have a dedicated cell. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_182.txt b/chunked/nltk_chunking/_deepspeed/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..268d7ba1f5fe96510ac97a2fead8df704f3815af --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_182.txt @@ -0,0 +1,62 @@ +%%bash +cat <<'EOT' > ds_config_zero3.json +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + }, +"optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + "eps": "auto", + "weight_decay": "auto" + } +}, + +"scheduler": { + "type": "WarmupLR", + "params": { + "warmup_min_lr": "auto", + "warmup_max_lr": "auto", + "warmup_num_steps": "auto" + } +}, + +"zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": true + }, + "offload_param": { + "device": "cpu", + "pin_memory": true + }, + "overlap_comm": true, + "contiguous_gradients": true, + "sub_group_size": 1e9, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_gather_16bit_weights_on_model_save": true +}, + +"gradient_accumulation_steps": "auto", +"gradient_clipping": "auto", +"steps_per_print": 2000, +"train_batch_size": "auto", +"train_micro_batch_size_per_gpu": "auto", +"wall_clock_breakdown": false + +} +EOT + +If the training script is in a file and not in a notebook cell, you can launch deepspeed normally from the shell in a notebook cell. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_183.txt b/chunked/nltk_chunking/_deepspeed/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..c96aa920e0b27760816f4214fb41460aa233c176 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_183.txt @@ -0,0 +1,5 @@ +For example, to launch run_translation.py: +py +!git clone https://github.com/huggingface/transformers +!cd transformers; deepspeed examples/pytorch/translation/run_translation.py +You could also use %%bash magic and write multi-line code to run the shell program, but you won't be able to view the logs until training is complete. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_184.txt b/chunked/nltk_chunking/_deepspeed/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ebfd831ec720a1a10066965ef7b7676df4238fd --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_184.txt @@ -0,0 +1 @@ +With %%bash magic, you don't need to emulate a distributed environment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_185.txt b/chunked/nltk_chunking/_deepspeed/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..179f25c3c934d255dc8d1f4525487fdc84fd3118 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_185.txt @@ -0,0 +1,7 @@ +%%bash +git clone https://github.com/huggingface/transformers +cd transformers +deepspeed examples/pytorch/translation/run_translation.py + +Save model weights +DeepSpeed stores the main full precision fp32 weights in custom checkpoint optimizer files (the glob pattern looks like global_step*/*optim_states.pt) and are saved under the normal checkpoint. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_186.txt b/chunked/nltk_chunking/_deepspeed/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..6832f83212792d63bdc6323a1085001e95563e6c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_186.txt @@ -0,0 +1 @@ +A model trained with ZeRO-2 saves the pytorch_model.bin weights in fp16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_187.txt b/chunked/nltk_chunking/_deepspeed/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..76c69d68e5aaac2303bd93cfc801277742b70614 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_187.txt @@ -0,0 +1 @@ +To save the model weights in fp16 for a model trained with ZeRO-3, you need to set "stage3_gather_16bit_weights_on_model_save": true because the model weights are partitioned across multiple GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_188.txt b/chunked/nltk_chunking/_deepspeed/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d111d84238666c93ed3105fdf7a90a3971b28a3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_188.txt @@ -0,0 +1 @@ +Otherwise, the [Trainer] won't save the weights in fp16 and it won't create a pytorch_model.bin file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_189.txt b/chunked/nltk_chunking/_deepspeed/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..68aa899c021ae4e20c4eaa732818578fadcb7334 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_189.txt @@ -0,0 +1 @@ +This is because DeepSpeed's state_dict contains a placeholder instead of the real weights and you won't be able to load them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_19.txt b/chunked/nltk_chunking/_deepspeed/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5489cb51a20dfcfc6f5deb2b919fbf525c2f9ad --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_19.txt @@ -0,0 +1 @@ +Feel free to work in whichever direction you prefer (starting with the most memory efficient or fastest) to discover the appropriate balance between speed and memory usage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_190.txt b/chunked/nltk_chunking/_deepspeed/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c2dbe13530438b5072c2e137e97f8c26aacd827 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_190.txt @@ -0,0 +1,8 @@ +yaml +{ + "zero_optimization": { + "stage3_gather_16bit_weights_on_model_save": true + } +} + +The full precision weights shouldn't be saved during training because it can require a lot of memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_191.txt b/chunked/nltk_chunking/_deepspeed/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d5bd68838b9d7f6a20a216ed829411517858494 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_191.txt @@ -0,0 +1 @@ +It is usually best to save the fp32 weights offline after training is complete. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_192.txt b/chunked/nltk_chunking/_deepspeed/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..6166e682cb5501ebeb26061e2452607f5a6f4326 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_192.txt @@ -0,0 +1 @@ +But if you have a lot of free CPU memory, it is possible to save the fp32 weights during training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_193.txt b/chunked/nltk_chunking/_deepspeed/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..58d033d0a9dfa83f4df6f11d56d7d48f962f4e3c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_193.txt @@ -0,0 +1 @@ +This section covers both online and offline approaches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_194.txt b/chunked/nltk_chunking/_deepspeed/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..b274a5d4e0e405c81036dcb414fa05e86cfb351b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_194.txt @@ -0,0 +1,9 @@ +Online +You must have saved at least one checkpoint to load the latest checkpoint as shown in the following: + +from transformers.trainer_utils import get_last_checkpoint +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +checkpoint_dir = get_last_checkpoint(trainer.args.output_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +If you've enabled the --load_best_model_at_end parameter to track the best checkpoint in [TrainingArguments], you can finish training first and save the final model explicitly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_195.txt b/chunked/nltk_chunking/_deepspeed/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb16fe0679e3057906ccb59bd5c2652257cd42da --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_195.txt @@ -0,0 +1,8 @@ +Then you can reload it as shown below: + +from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint +checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final") +trainer.deepspeed.save_checkpoint(checkpoint_dir) +fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + +Once load_state_dict_from_zero_checkpoint is run, the model is no longer usable in DeepSpeed in the context of the same application. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_196.txt b/chunked/nltk_chunking/_deepspeed/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bc5e1baa81d3f632c1076cc03fcc560f459efe3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_196.txt @@ -0,0 +1 @@ +You'll need to initialize the DeepSpeed engine again since model.load_state_dict(state_dict) removes all the DeepSpeed magic from it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_197.txt b/chunked/nltk_chunking/_deepspeed/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea449340d5a8c3cb902d3513542da9c0c150cb5d --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_197.txt @@ -0,0 +1 @@ +Only use this at the very end of training. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_198.txt b/chunked/nltk_chunking/_deepspeed/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..08aae38d0504e91b1b51ea6167ca0c7c9661ea6c --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_198.txt @@ -0,0 +1,9 @@ +You can also extract and load the state_dict of the fp32 weights: + +from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint +state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu +model = model.cpu() +model.load_state_dict(state_dict) + +Offline +DeepSpeed provides a zero_to_fp32.py script at the top-level of the checkpoint folder for extracting weights at any point. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_199.txt b/chunked/nltk_chunking/_deepspeed/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..79dbd9ab748b2a20f0a88ec124fc9e57f24d2a90 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_199.txt @@ -0,0 +1 @@ +This is a standalone script and you don't need a configuration file or [Trainer]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_2.txt b/chunked/nltk_chunking/_deepspeed/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..32d07b6042a88cbaf5cb6350b1443fc0aaedfb2d --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_2.txt @@ -0,0 +1,7 @@ +ZeRO works in several stages: + +ZeRO-1, optimizer state partitioning across GPUs +ZeRO-2, gradient partitioning across GPUs +ZeRO-3, parameter partitioning across GPUs + +In GPU-limited environments, ZeRO also enables offloading optimizer memory and computation from the GPU to the CPU to fit and train really large models on a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_20.txt b/chunked/nltk_chunking/_deepspeed/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b39f0ebd7e37546ef4175073835a54eea542d79 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_20.txt @@ -0,0 +1,16 @@ +A general process you can use is (start with batch size of 1): + +enable gradient checkpointing +try ZeRO-2 +try ZeRO-2 and offload the optimizer +try ZeRO-3 +try ZeRO-3 and offload parameters to the CPU +try ZeRO-3 and offload parameters and the optimizer to the CPU +try lowering various default values like a narrower search beam if you're using the [~GenerationMixin.generate] method +try mixed half-precision (fp16 on older GPU architectures and bf16 on Ampere) over full-precision weights +add more hardware if possible or enable Infinity to offload parameters and the optimizer to an NVMe +once you're not running out of memory, measure effective throughput and then try to increase the batch size as large as you can to maximize GPU efficiency +lastly, try to optimize your training setup by disabling some offload features or using a faster ZeRO stage and increasing/decreasing the batch size to find the best tradeoff between speed and memory usage + +DeepSpeed configuration file +DeepSpeed works with the [Trainer] class by way of a config file containing all the parameters for configuring how you want to set up your training run.
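As a point of reference, a minimal ZeRO-2 configuration file of the kind the [Trainer] consumes might look like the sketch below. "auto" lets the [Trainer] fill in matching values from [TrainingArguments]; the exact fields and the file name are illustrative assumptions, not prescriptive.

```python
import json

zero2_config = {
    "fp16": {"enabled": "auto"},
    "optimizer": {
        "type": "AdamW",
        "params": {"lr": "auto", "weight_decay": "auto"},
    },
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
        "overlap_comm": True,
        "contiguous_gradients": True,
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

# Write the config so it can be passed to the Trainer via --deepspeed ds_config_zero2.json
with open("ds_config_zero2.json", "w") as f:
    json.dump(zero2_config, f, indent=2)
```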
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_200.txt b/chunked/nltk_chunking/_deepspeed/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b74d93ec44223fa07a216e69bf4360bd265d0e6 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_200.txt @@ -0,0 +1,16 @@ +For example, if your checkpoint folder looked like this: + +$ ls -l output_dir/checkpoint-1/ +-rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json +drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/ +-rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest +-rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt +-rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin +-rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt +-rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json +-rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model +-rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json +-rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json +-rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin +-rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py* +To reconstruct the fp32 weights from the DeepSpeed checkpoint (ZeRO-2 or ZeRO-3) subfolder global_step1, run the following command to create and consolidate the full fp32 weights from multiple GPUs into a single pytorch_model.bin file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_201.txt b/chunked/nltk_chunking/_deepspeed/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f1793c2825ccf78774581ebf737b5010bcb6696 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_201.txt @@ -0,0 +1 @@ +The script automatically discovers the subfolder containing the checkpoint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_202.txt b/chunked/nltk_chunking/_deepspeed/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a1aefd6f31b31150200a6ea7af21a6ae3d88ea --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_202.txt @@ -0,0 +1,2 @@ +py +python zero_to_fp32.py . \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_203.txt b/chunked/nltk_chunking/_deepspeed/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..628c479cb6c763603c97698c0badd8a844871de2 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_203.txt @@ -0,0 +1,3 @@ +pytorch_model.bin + +Run python zero_to_fp32.py -h for more usage details. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_204.txt b/chunked/nltk_chunking/_deepspeed/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab8884b5cdfc7e359b602b7ef645a9531ccd15ed --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_204.txt @@ -0,0 +1 @@ +The script requires 2x the general RAM of the final fp32 weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_205.txt b/chunked/nltk_chunking/_deepspeed/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..d535f3790d8f2b6d962341e6ccf07ff67d457189 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_205.txt @@ -0,0 +1,2 @@ +ZeRO Inference +ZeRO Inference places the model weights in CPU or NVMe memory to avoid burdening the GPU which makes it possible to run inference with huge models on a GPU. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_206.txt b/chunked/nltk_chunking/_deepspeed/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..33ac83454d197a3f78c8003d52a1fbf766ec87b0 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_206.txt @@ -0,0 +1 @@ +Inference doesn't require any large additional amounts of memory for the optimizer states and gradients so you can fit much larger batches and/or sequence lengths on the same hardware. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_207.txt b/chunked/nltk_chunking/_deepspeed/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..25464cdad9d2136008db1f015442e3f7d44f54d9 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_207.txt @@ -0,0 +1 @@ +ZeRO Inference shares the same configuration file as ZeRO-3, and ZeRO-2 and ZeRO-1 configs won't work because they don't provide any benefits for inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_208.txt b/chunked/nltk_chunking/_deepspeed/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4e07ac6876fcfec8f8e71fe4b9418a933428de5 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_208.txt @@ -0,0 +1 @@ +To run ZeRO Inference, pass your usual training arguments to the [TrainingArguments] class and add the --do_eval argument. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_209.txt b/chunked/nltk_chunking/_deepspeed/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..067bc24d2700ddacb3eb3249c6218693b5f2c391 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_209.txt @@ -0,0 +1,3 @@ +deepspeed --num_gpus=2 your_program.py --do_eval --deepspeed ds_config.json +Non-Trainer DeepSpeed integration +DeepSpeed also works with Transformers without the [Trainer] class. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_21.txt b/chunked/nltk_chunking/_deepspeed/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..265212c5e4b0136a8c36b2ae3931b36682160347 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_21.txt @@ -0,0 +1 @@ +When you execute your training script, DeepSpeed logs the configuration it received from [Trainer] to the console so you can see exactly what configuration was used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_210.txt b/chunked/nltk_chunking/_deepspeed/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d4344d5943b85bd68e894ebf50560c49d8ed93b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_210.txt @@ -0,0 +1 @@ +This is handled by the [HfDeepSpeedConfig] which only takes care of gathering ZeRO-3 parameters and splitting a model across multiple GPUs when you call [~PreTrainedModel.from_pretrained]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_211.txt b/chunked/nltk_chunking/_deepspeed/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1953568614b9156e1868ae1694427e0c3aff558 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_211.txt @@ -0,0 +1 @@ +If you want everything automatically taken care of for you, try using DeepSpeed with the [Trainer]! 
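A hypothetical sketch of what the your_program.py evaluation script launched above could contain. The model name, the two-sentence toy eval set, and the extra --output_dir flag (required by [TrainingArguments]) are assumptions for illustration; it would be launched e.g. as `deepspeed --num_gpus=2 your_program.py --output_dir out --do_eval --deepspeed ds_config.json`.

```python
import torch
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          HfArgumentParser, Trainer, TrainingArguments)

class ToyEvalSet(torch.utils.data.Dataset):
    """Tiny stand-in eval dataset so the sketch is self-contained."""
    def __init__(self, tokenizer):
        texts, labels = ["a great movie", "a terrible movie"], [1, 0]
        enc = tokenizer(texts, padding=True, return_tensors="pt")
        self.items = []
        for i, label in enumerate(labels):
            item = {k: v[i] for k, v in enc.items()}
            item["labels"] = torch.tensor(label)
            self.items.append(item)

    def __len__(self):
        return len(self.items)

    def __getitem__(self, idx):
        return self.items[idx]

# Picks up --do_eval, --deepspeed, --output_dir, ... from the launch command
parser = HfArgumentParser(TrainingArguments)
training_args = parser.parse_args_into_dataclasses()[0]

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(model=model, args=training_args, eval_dataset=ToyEvalSet(tokenizer))
if training_args.do_eval:
    print(trainer.evaluate())
```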
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_212.txt b/chunked/nltk_chunking/_deepspeed/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca95de5e9953d06ab3a82e7817ea2b68aad48d1a --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_212.txt @@ -0,0 +1 @@ +You'll need to follow the DeepSpeed documentation, and manually configure the parameter values in the config file (you can't use the "auto" value). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_213.txt b/chunked/nltk_chunking/_deepspeed/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8fa42490f1f8a13a8301b7f64be521aa62b5e1d --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_213.txt @@ -0,0 +1,12 @@ +To efficiently deploy ZeRO-3, you must instantiate the [HfDeepSpeedConfig] object before the model and keep that object alive: + +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel +import deepspeed +ds_config = {} # deepspeed config object or path to the file +must run before instantiating the model to detect zero 3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +model = AutoModel.from_pretrained("openai-community/gpt2") +engine = deepspeed.initialize(model=model, config_params=ds_config, ) + +[HfDeepSpeedConfig] is not required for ZeRO-1 or ZeRO-2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_214.txt b/chunked/nltk_chunking/_deepspeed/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9550b3d3833cdbb78d8a08152da0f196c3e73f --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_214.txt @@ -0,0 +1,12 @@ +from transformers.integrations import HfDeepSpeedConfig +from transformers import AutoModel, AutoConfig +import deepspeed +ds_config = {} # deepspeed config object or path to the file +must run before instantiating the model to detect zero 3 +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +config = AutoConfig.from_pretrained("openai-community/gpt2") +model = AutoModel.from_config(config) +engine = deepspeed.initialize(model=model, config_params=ds_config, ) + +Non-Trainer ZeRO Inference +To run ZeRO Inference without the [Trainer] in cases where you can’t fit a model onto a single GPU, try using additional GPUs or/and offloading to CPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_215.txt b/chunked/nltk_chunking/_deepspeed/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..5daeda664e767cebd77a46eb3d81734346229741 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_215.txt @@ -0,0 +1 @@ +The important nuance to understand here is that the way ZeRO is designed, you can process different inputs on different GPUs in parallel. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_216.txt b/chunked/nltk_chunking/_deepspeed/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ccaa2f7c5c20f7a7e8822fca92ce8b96a43e70b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_216.txt @@ -0,0 +1,3 @@ +Make sure to: + +disable CPU offload if you have enough GPU memory (since it slows things down). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_217.txt b/chunked/nltk_chunking/_deepspeed/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..e458c5a8d0adaaaac0a6515b093dff1ec760cb77 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_217.txt @@ -0,0 +1 @@ +enable bf16 if you have an Ampere or newer GPU to make things faster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_218.txt b/chunked/nltk_chunking/_deepspeed/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e9ae62cb01a0674e55fe39acf74b1d17653b74 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_218.txt @@ -0,0 +1 @@ +If you don’t have one of these GPUs, you may enable fp16 as long as you don’t use a model pretrained in bf16 (T5 models) because it may lead to an overflow error. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_219.txt b/chunked/nltk_chunking/_deepspeed/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..87257ed0daf24e52ae74ee0b50790f2fd88e5e8b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_219.txt @@ -0,0 +1 @@ +Take a look at the following script to get a better idea of how to run ZeRO Inference without the [Trainer] on a model that won't fit on a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_22.txt b/chunked/nltk_chunking/_deepspeed/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..83c6b7f9700c13bc36e0729fb5eec9bdc4bde977 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_22.txt @@ -0,0 +1 @@ +Find a complete list of DeepSpeed configuration options on the DeepSpeed Configuration JSON reference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_220.txt b/chunked/nltk_chunking/_deepspeed/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f114b7736c63cecdeeca2ad4435bf73bd2b3f18 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_220.txt @@ -0,0 +1,5 @@ +!/usr/bin/env python +This script demonstrates how to use Deepspeed ZeRO in an inference mode when one can't fit a model +into a single GPU + +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_221.txt b/chunked/nltk_chunking/_deepspeed/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce14e06cd0d285fc0b3299af8bbb87e938989bb7 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_221.txt @@ -0,0 +1,2 @@ +Use 1 GPU with CPU offload +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_222.txt b/chunked/nltk_chunking/_deepspeed/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b8e5830cdd2056c3adfa9f22a4151880395bc7 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_222.txt @@ -0,0 +1,6 @@ +Or use multiple GPUs instead + +First you need to install deepspeed: pip install deepspeed + +Here we use a 3B "bigscience/T0_3B" model which needs about 15GB GPU RAM - so 1 largish or 2 +small GPUs can handle it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_223.txt b/chunked/nltk_chunking/_deepspeed/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..a61098f98a57f0b041394acfc332fb1b77cd9c5a --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_223.txt @@ -0,0 +1 @@ +or 1 small GPU and a lot of CPU memory. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_224.txt b/chunked/nltk_chunking/_deepspeed/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..091d3ea92793255a5020aea7cb37c2a35e49974a --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_224.txt @@ -0,0 +1,2 @@ +To use a larger model like "bigscience/T0" which needs about 50GB, unless you have an 80GB GPU - +you will need 2-4 gpus. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_225.txt b/chunked/nltk_chunking/_deepspeed/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d3340ded6597ce3b27fe29846346f217e588c7e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_225.txt @@ -0,0 +1,2 @@ +And then you can adapt the script to handle more gpus if you want to +process multiple inputs at once. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_226.txt b/chunked/nltk_chunking/_deepspeed/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..62e29f40a9d915a4df2b81020f84088135687c42 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_226.txt @@ -0,0 +1,3 @@ +The provided deepspeed config also activates CPU memory offloading, so chances are that if you +have a lot of available CPU memory and you don't mind a slowdown you should be able to load a +model that doesn't normally fit into a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_227.txt b/chunked/nltk_chunking/_deepspeed/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ad1b3e6dd68cddef8ae8e43af427118e4ffd88b --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_227.txt @@ -0,0 +1,2 @@ +If you have enough GPU memory the program will +run faster if you don't want offload to CPU - so disable that section then. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_228.txt b/chunked/nltk_chunking/_deepspeed/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..689a2f40ebce3c74f4d4e1004af14697d0cb3a27 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_228.txt @@ -0,0 +1,31 @@ +To deploy on 1 gpu: + +deepspeed --num_gpus 1 t0.py +or: +python -m torch.distributed.run --nproc_per_node=1 t0.py + +To deploy on 2 gpus: + +deepspeed --num_gpus 2 t0.py +or: +python -m torch.distributed.run --nproc_per_node=2 t0.py +from transformers import AutoTokenizer, AutoConfig, AutoModelForSeq2SeqLM +from transformers.integrations import HfDeepSpeedConfig +import deepspeed +import os +import torch +os.environ["TOKENIZERS_PARALLELISM"] = "false" # To avoid warnings about parallelism in tokenizers +distributed setup +local_rank = int(os.getenv("LOCAL_RANK", "0")) +world_size = int(os.getenv("WORLD_SIZE", "1")) +torch.cuda.set_device(local_rank) +deepspeed.init_distributed() +model_name = "bigscience/T0_3B" +config = AutoConfig.from_pretrained(model_name) +model_hidden_size = config.d_model +batch size has to be divisible by world_size, but can be bigger than world_size +train_batch_size = 1 * world_size +ds_config notes + +- enable bf16 if you use Ampere or higher GPU - this will run in mixed precision and will be +faster. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_229.txt b/chunked/nltk_chunking/_deepspeed/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..02e9d421ac39f4e9da4bf43b2b42616851f71422 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_229.txt @@ -0,0 +1 @@ +- for older GPUs you can enable fp16, but it'll only work for non-bf16 pretrained models - e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_23.txt b/chunked/nltk_chunking/_deepspeed/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c35be0ec2f52494611e8a94539bad615f637a86 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_23.txt @@ -0,0 +1 @@ +You can also find more practical examples of various DeepSpeed configuration examples on the DeepSpeedExamples repository or the main DeepSpeed repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_230.txt b/chunked/nltk_chunking/_deepspeed/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..609dd75d3eee2c71e9ca20ade3640c8c60e01b3f --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_230.txt @@ -0,0 +1,39 @@ +all official t5 models are bf16-pretrained + +- set offload_param.device to "none" or completely remove the offload_param section if you don't +- want CPU offload + +- if using offload_param you can manually finetune stage3_param_persistence_threshold to control +- which params should remain on gpus - the larger the value the smaller the offload size + +For in-depth info on Deepspeed config see +https://huggingface.co/docs/transformers/main/main_classes/deepspeed +keeping the same format as json for consistency, except it uses lower case for true/false +fmt: off +ds_config = { + "fp16": { + "enabled": False + }, + "bf16": { + "enabled": False + }, + "zero_optimization": { + "stage": 3, + "offload_param": { + "device": "cpu", + "pin_memory": True + }, + "overlap_comm": True, + "contiguous_gradients": True, + "reduce_bucket_size": model_hidden_size * model_hidden_size, + "stage3_prefetch_bucket_size": 0.9 * model_hidden_size * model_hidden_size, + "stage3_param_persistence_threshold": 10 * model_hidden_size + }, + "steps_per_print": 2000, + "train_batch_size": train_batch_size, + "train_micro_batch_size_per_gpu": 1, + "wall_clock_breakdown": False +} +fmt: on +next line instructs transformers to partition the model directly over multiple gpus using +deepspeed.zero.Init when model's from_pretrained method is called. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_231.txt b/chunked/nltk_chunking/_deepspeed/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..98c3c805aab06361fff63ccb49b2df17c7d8a1a1 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_231.txt @@ -0,0 +1,6 @@ +it has to be run before loading the model AutoModelForSeq2SeqLM.from_pretrained(model_name) + +otherwise the model will first be loaded normally and only partitioned at forward time which is +less efficient and when there is little CPU RAM may fail +dschf = HfDeepSpeedConfig(ds_config) # keep this object alive +now a model can be loaded. 
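Following the notes in the chunk above, a couple of optional one-line tweaks to the ds_config dict defined in the script. Whether they help is an assumption about your hardware, and they must run before dschf = HfDeepSpeedConfig(ds_config) so the engine sees the updated values.

```python
# On Ampere or newer GPUs, switch the script to bfloat16 mixed precision
ds_config["bf16"]["enabled"] = True
ds_config["fp16"]["enabled"] = False

# If the model fits in GPU memory, keep parameters on the GPU; CPU offload slows inference down
ds_config["zero_optimization"]["offload_param"]["device"] = "none"
```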
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_232.txt b/chunked/nltk_chunking/_deepspeed/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..240e1ddbd999fe11625baf6f7e2c18f9d199f923 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_232.txt @@ -0,0 +1,5 @@ +model = AutoModelForSeq2SeqLM.from_pretrained(model_name) +initialise Deepspeed ZeRO and store only the engine object +ds_engine = deepspeed.initialize(model=model, config_params=ds_config)[0] +ds_engine.module.eval() # inference +Deepspeed ZeRO can process unrelated inputs on each GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_233.txt b/chunked/nltk_chunking/_deepspeed/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..c1b152a36e3b2380d266cd6a9c1f90f3b454055f --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_233.txt @@ -0,0 +1 @@ +So for 2 gpus you process 2 inputs at once. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_234.txt b/chunked/nltk_chunking/_deepspeed/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..4180c266623e4e59b59fe3d151f4e2717fa31e9e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_234.txt @@ -0,0 +1 @@ +If you use more GPUs adjust for more. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_235.txt b/chunked/nltk_chunking/_deepspeed/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3486db1076ee897175669e78a11fdca4e2a0de3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_235.txt @@ -0,0 +1,5 @@ +And of course if you have just one input to process you then need to pass the same string to both gpus +If you use only one GPU, then you will have only rank 0. +rank = torch.distributed.get_rank() +if rank == 0: + text_in = "Is this review positive or negative? \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_236.txt b/chunked/nltk_chunking/_deepspeed/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a626c583ae024dfaa0a3553df78b9fab8fc354f --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_236.txt @@ -0,0 +1,3 @@ +Review: this is the best cast iron skillet you will ever buy" +elif rank == 1: + text_in = "Is this review positive or negative? \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_237.txt b/chunked/nltk_chunking/_deepspeed/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..568375e9faad734bbb487a98f7bed318ea5c633f --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_237.txt @@ -0,0 +1,13 @@ +Review: this is the worst restaurant ever" +tokenizer = AutoTokenizer.from_pretrained(model_name) +inputs = tokenizer.encode(text_in, return_tensors="pt").to(device=local_rank) +with torch.no_grad(): + outputs = ds_engine.module.generate(inputs, synced_gpus=True) +text_out = tokenizer.decode(outputs[0], skip_special_tokens=True) +print(f"rank{rank}:\n in={text_in}\n out={text_out}") + +Save the script as t0.py and launch it: + +$ deepspeed --num_gpus 2 t0.py +rank0: + in=Is this review positive or negative? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_238.txt b/chunked/nltk_chunking/_deepspeed/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..01c52f85d4fd1386f6a32650127db5ef2273cfa3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_238.txt @@ -0,0 +1,4 @@ +Review: this is the best cast iron skillet you will ever buy + out=Positive +rank1: + in=Is this review positive or negative? \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_239.txt b/chunked/nltk_chunking/_deepspeed/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a41133e3680fb3b988050d112a6cbc306244639 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_239.txt @@ -0,0 +1,3 @@ +Review: this is the worst restaurant ever + out=negative +This is a very basic example and you'll want to adapt it to your use case. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_24.txt b/chunked/nltk_chunking/_deepspeed/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..60cb881fe25232dfb8a24720af9eacd4006f0a73 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_24.txt @@ -0,0 +1,5 @@ +To quickly find specific examples, you can: +```bash +git clone https://github.com/microsoft/DeepSpeedExamples +cd DeepSpeedExamples +find . \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_240.txt b/chunked/nltk_chunking/_deepspeed/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b1675345714f023a80cfc0ab301a549638926ce --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_240.txt @@ -0,0 +1,2 @@ +Generate +Using multiple GPUs with ZeRO-3 for generation requires synchronizing the GPUs by setting synced_gpus=True in the [~GenerationMixin.generate] method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_241.txt b/chunked/nltk_chunking/_deepspeed/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0cae88bf7e2ac25a5703e5fd4999dc3f61f7103 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_241.txt @@ -0,0 +1 @@ +Otherwise, if one GPU finishes generating before another one, the whole system hangs because the remaining GPUs haven't received the weight shard from the GPU that finished first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_242.txt b/chunked/nltk_chunking/_deepspeed/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..0423c76b71de20639a5eae195a37eee79c715247 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_242.txt @@ -0,0 +1 @@ +For Transformers>=4.28, synced_gpus is automatically set to True if multiple GPUs are detected during generation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_243.txt b/chunked/nltk_chunking/_deepspeed/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..68b103857d8828509e9db0a67d889ce135f729fc --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_243.txt @@ -0,0 +1,2 @@ +Troubleshoot +When you encounter an issue, you should consider whether DeepSpeed is the cause of the problem because often it isn't (unless it's super obvious and you can see DeepSpeed modules in the exception)!
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_244.txt b/chunked/nltk_chunking/_deepspeed/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..31fcff1b02849c6cfc7aad6a7ae78471149d73c7 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_244.txt @@ -0,0 +1 @@ +The first step should be to retry your setup without DeepSpeed, and if the problem persists, then you can report the issue. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_245.txt b/chunked/nltk_chunking/_deepspeed/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..d23931ba70e8f3effb6cacc24d49d9f8e3e6bbd1 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_245.txt @@ -0,0 +1 @@ +If the issue is a core DeepSpeed problem and unrelated to the Transformers integration, open an Issue on the DeepSpeed repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_246.txt b/chunked/nltk_chunking/_deepspeed/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d00c6cc429d03180bb28ae73897134b98b02253 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_246.txt @@ -0,0 +1,17 @@ +For issues related to the Transformers integration, please provide the following information: + +the full DeepSpeed config file + +the command line arguments of the [Trainer], or [TrainingArguments] arguments if you're scripting the [Trainer] setup yourself (don't dump the [TrainingArguments] which has dozens of irrelevant entries) + +the outputs of: + +python -c 'import torch; print(f"torch: {torch.__version__}")' +python -c 'import transformers; print(f"transformers: {transformers.__version__}")' +python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")' + +a link to a Google Colab notebook to reproduce the issue + +if impossible, a standard and non-custom dataset we can use and also try to use an existing example to reproduce the issue with + +The following sections provide a guide for resolving two of the most common issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_247.txt b/chunked/nltk_chunking/_deepspeed/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a7383f5ab1943f1c61c85f84b2f4ca4123d0eac --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_247.txt @@ -0,0 +1,2 @@ +DeepSpeed process killed at startup +When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than your system has or your process tried to allocate more CPU memory than allowed leading the OS kernel to terminate the process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_248.txt b/chunked/nltk_chunking/_deepspeed/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cc740435bdb269d1d0f4a85e9d36c9acc08bac3 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_248.txt @@ -0,0 +1 @@ +In this case, check whether your configuration file has either offload_optimizer, offload_param or both configured to offload to the CPU. 
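If the process is killed because CPU offload exhausts system RAM, it can help to estimate how much memory the model states will need before choosing offload settings. A minimal sketch, assuming deepspeed is installed and using a placeholder model that fits in CPU memory:

```python
from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live
from transformers import AutoModel

model = AutoModel.from_pretrained("bigscience/T0_3B")  # placeholder model

# Prints estimated per-GPU and CPU memory needs for the different ZeRO-3 offload options
estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)
```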
\ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_249.txt b/chunked/nltk_chunking/_deepspeed/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b08b2ddd4c5408711b49d237a88d839066c1cb0 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_249.txt @@ -0,0 +1 @@ +If you have NVMe and ZeRO-3 setup, experiment with offloading to the NVMe (estimate the memory requirements for your model). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_25.txt b/chunked/nltk_chunking/_deepspeed/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..9a210474e5d36be72d6c8a3cda15127be88de254 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_25.txt @@ -0,0 +1,3 @@ +-name '*json' +find examples with the Lamb optimizer +grep -i Lamb $(find . \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_250.txt b/chunked/nltk_chunking/_deepspeed/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad7ec49bddf99aa1bb87785effd7d59cfd08f89 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_250.txt @@ -0,0 +1,2 @@ +NaN loss +NaN loss often occurs when a model is pretrained in bf16 and then you try to use it with fp16 (especially relevant for TPU trained models). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_251.txt b/chunked/nltk_chunking/_deepspeed/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..83034be6800994550a0c8cd75183c107eec76f30 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_251.txt @@ -0,0 +1 @@ +To resolve this, use fp32 or bf16 if your hardware supports it (TPU, Ampere GPUs or newer). \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_252.txt b/chunked/nltk_chunking/_deepspeed/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fde88ce34ffcf99b4462b0bb98ffe3d8ae9ba39 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_252.txt @@ -0,0 +1 @@ +The other issue may be related to using fp16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_253.txt b/chunked/nltk_chunking/_deepspeed/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..1924b6b02b0ec8da9885b7422ff8f60cb3c0f0a5 --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_253.txt @@ -0,0 +1,13 @@ +For example, if this is your fp16 configuration: +yaml +{ + "fp16": { + "enabled": "auto", + "loss_scale": 0, + "loss_scale_window": 1000, + "initial_scale_power": 16, + "hysteresis": 2, + "min_loss_scale": 1 + } +} +You might see the following OVERFLOW! \ No newline at end of file diff --git a/chunked/nltk_chunking/_deepspeed/chunk_254.txt b/chunked/nltk_chunking/_deepspeed/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dad5e808aa3e0256f2de3c5747178fff21bec1e --- /dev/null +++ b/chunked/nltk_chunking/_deepspeed/chunk_254.txt @@ -0,0 +1,4 @@ +messages in the logs: + +0%| | 0/189 [00:00 len(list2): + results.extend(list1[i+1:]) + else: + results.extend(list2[i+1:]) + return results + +""" +`` +For demonstration purposes, we duplicate the system prompt by ten so that the input length is long enough to observe Flash Attention's memory savings. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_134.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a3e2355efd9c619e380e1c9c93210402470e8d6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_134.txt @@ -0,0 +1,4 @@ +We append the original text prompt"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"` +python +long_prompt = 10 * system_prompt + prompt +We instantiate our model again in bfloat16 precision. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_135.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..af7cd2638de7f62e87cc3b70919ff311509b5fb8 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_135.txt @@ -0,0 +1,6 @@ +thon +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto") +tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder") +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + +Let's now run the model just like before without Flash Attention and measure the peak GPU memory requirement and inference time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_136.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..2202fff91e19699fa7a52382adc3945e6433467c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_136.txt @@ -0,0 +1,5 @@ +thon +import time +start_time = time.time() +result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] +print(f"Generated in {time.time() - start_time} seconds.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_137.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..259623816e5024ace4eb9623b9d71659a6f67d9a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_137.txt @@ -0,0 +1,5 @@ +result + +Output: + +Generated in 10.96854019165039 seconds. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_138.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b3483cfedb34ff832fb04eaac426586a0fca55c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_138.txt @@ -0,0 +1 @@ +Sure. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_139.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bf948f8a64dfd3ffee41e031bc2e56e23b8947 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_139.txt @@ -0,0 +1 @@ +Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_14.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee85c37add9a5c3b9e680b0a73a2d3d39ca24f91 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_14.txt @@ -0,0 +1 @@ +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_140.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..59721de1d6bfb62ed7c64461114a0530c9e10d84 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_140.txt @@ -0,0 +1,3 @@ +Here is a function that does that.\n\ndef +` +We're getting the same output as before, however this time, the model repeats the answer multiple times until it's 60 tokens cut-off. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_141.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..6da491543b1603e5091bb1306f83f26474fa1f2b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_141.txt @@ -0,0 +1 @@ +This is not surprising as we've repeated the system prompt ten times for demonstration purposes and thus cued the model to repeat itself. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_142.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..24b33f735d8bd88247992d58b784b57152cba805 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_142.txt @@ -0,0 +1 @@ +Note that the system prompt should not be repeated ten times in real-world applications - one time is enough! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_143.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9e36c7ac384d4730aadc26fdea5bfdc0c14d3cd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_143.txt @@ -0,0 +1 @@ +Let's measure the peak GPU memory requirement. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_144.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..d218862b801051deaf101fb22060db4b4ca5fa81 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_144.txt @@ -0,0 +1,6 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: + +37.668193340301514 +As we can see the peak GPU memory requirement is now significantly higher than in the beginning, which is largely due to the longer input sequence. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_145.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1929bad140447f7c24786a88855c546d41cdb02 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_145.txt @@ -0,0 +1 @@ +Also the generation takes a little over a minute now. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_146.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e86a8725a8fc42719684a88e26f41fed684d045 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_146.txt @@ -0,0 +1 @@ +We call flush() to free GPU memory for our next experiment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_147.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..1636a3f931925cd77aade985f59294d70852542f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_147.txt @@ -0,0 +1,3 @@ +python +flush() +For comparison, let's run the same function, but enable Flash Attention instead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_148.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e6a91da062935a013898180a9045f80257382bd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_148.txt @@ -0,0 +1 @@ +To do so, we convert the model to BetterTransformer and by doing so enabling PyTorch's SDPA self-attention which in turn is able to use Flash Attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_149.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..30940939265463e3a66b964de537b9c89d727efe --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_149.txt @@ -0,0 +1,3 @@ +python +model.to_bettertransformer() +Now we run the exact same code snippet as before and under the hood Transformers will make use of Flash Attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_15.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a47eb9cf6a729ba5b6bd7ad589449e2d02dc293c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_15.txt @@ -0,0 +1,2 @@ +Lower Precision +Memory requirements of LLMs can be best understood by seeing the LLM as a set of weight matrices and vectors and the text inputs as a sequence of vectors. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_150.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..be4cf5819b32b6fc08f67212a57c5e87bda4bc6b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_150.txt @@ -0,0 +1,4 @@ +start_time = time.time() +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):] +print(f"Generated in {time.time() - start_time} seconds.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_151.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b60610c9db994d6276f55e41ed19701a690a06f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_151.txt @@ -0,0 +1,4 @@ +result + +Output: +Generated in 3.0211617946624756 seconds. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_152.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b3483cfedb34ff832fb04eaac426586a0fca55c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_152.txt @@ -0,0 +1 @@ +Sure. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_153.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..40bf948f8a64dfd3ffee41e031bc2e56e23b8947 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_153.txt @@ -0,0 +1 @@ +Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_154.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d496a401fa6bb8a9786a14966be5fd34c098262 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_154.txt @@ -0,0 +1,2 @@ +Here is a function that does that.\n\ndef +We're getting the exact same result as before, but can observe a very significant speed-up thanks to Flash Attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_155.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..9336b4eeca7ee514f134a3f2e0edf3024eff3caa --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_155.txt @@ -0,0 +1 @@ +Let's measure the memory consumption one last time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_156.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..c019cd8b6a77900fedbe6bb04443a900f5b7c02a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_156.txt @@ -0,0 +1,5 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: +32.617331981658936 +And we're almost back to our original 29GB peak GPU memory from the beginning. 
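The measurements above call bytes_to_giga_bytes() and flush() without their definitions appearing in these chunks; a minimal version consistent with how they are used (an assumption, not copied from the source) is:

```python
import gc

import torch

def bytes_to_giga_bytes(num_bytes):
    # Convert a raw byte count (e.g. torch.cuda.max_memory_allocated()) to GiB
    return num_bytes / 1024 / 1024 / 1024

def flush():
    # Release cached GPU memory and reset the peak-memory counter between experiments
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
```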
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_157.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ef694b64246d7b0128b04dcb18703508d889ea5 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_157.txt @@ -0,0 +1 @@ +We can observe that we only use roughly 100MB more GPU memory when passing a very long input sequence with Flash Attention compared to passing a short input sequence as done in the beginning. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_158.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..89af3963135a77d7d31cbd884e70d0e08acd5dd4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_158.txt @@ -0,0 +1,3 @@ +py +flush() +For more information on how to use Flash Attention, please have a look at this doc page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_159.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_159.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_16.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..54516dbfc8c9ba7dd61e369a650db7538df57d18 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_16.txt @@ -0,0 +1 @@ +In the following, the definition weights will be used to signify all model weight matrices and vectors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_160.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..114f8b7e3cb2d764a819775edac1a8bd35747211 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_160.txt @@ -0,0 +1,7 @@ +Architectural Innovations +So far we have looked into improving computational and memory efficiency by: + +Casting the weights to a lower precision format +Replacing the self-attention algorithm with a more memory- and compute efficient version + +Let's now look into how we can change the architecture of an LLM so that it is most effective and efficient for task that require long text inputs, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_161.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..3127753795c7b3e86ff6e20173073d0c3393f61b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_161.txt @@ -0,0 +1,5 @@ +: +- Retrieval augmented Questions Answering, +- Summarization, +- Chat +Note that chat not only requires the LLM to handle long text inputs, but it also necessitates that the LLM is able to efficiently handle the back-and-forth dialogue between user and assistant (such as ChatGPT). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_162.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..06e78aec9a87987ddc29c4fcca3425cf8cde8e03 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_162.txt @@ -0,0 +1 @@ +Once trained, the fundamental LLM architecture is difficult to change, so it is important to make considerations about the LLM's tasks beforehand and accordingly optimize the model's architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_163.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ac6a36cecd35299fc2d58eba1d6912649593b94 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_163.txt @@ -0,0 +1 @@ +There are two important components of the model architecture that quickly become memory and/or performance bottlenecks for large input sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_164.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..17f665281e4b6f37c21d43ba34b75a74c9445546 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_164.txt @@ -0,0 +1,6 @@ +The positional embeddings +The key-value cache + +Let's go over each component in more detail +3.1 Improving positional embeddings of LLMs +Self-attention puts each token in relation to each other's tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_165.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab395f0330c568fb424068c46e0ee270449facd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_165.txt @@ -0,0 +1,3 @@ +As an example, the \( \text{Softmax}(\mathbf{QK}^T) \) matrix of the text input sequence "Hello", "I", "love", "you" could look as follows: + +Each word token is given a probability mass at which it attends all other word tokens and, therefore is put into relation with all other word tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_166.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_166.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_167.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba52e7e310a0cd90fe88a151bf111e203ef1244f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_167.txt @@ -0,0 +1 @@ +the word "love" attends to the word "Hello" with 5%, to "I" with 30%, and to itself with 65%. 
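To make the attention-weight picture concrete, here is a toy sketch of the (scaled) \( \text{Softmax}(\mathbf{QK}^T) \) computation for a four-token sequence; the hidden states and projection matrices are random and purely illustrative.

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
seq_len, d = 4, 8                       # four tokens: "Hello", "I", "love", "you"
x = torch.randn(seq_len, d)             # toy hidden states
w_q, w_k = torch.randn(d, d), torch.randn(d, d)

q, k = x @ w_q, x @ w_k
attn = F.softmax(q @ k.T / d**0.5, dim=-1)  # scaled Softmax(QK^T)

print(attn)          # row i: the probability mass token i spends on every token
print(attn.sum(-1))  # each row sums to 1
```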
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_168.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5974c1f24fc4be516cfe0d9c1c2348555ca966e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_168.txt @@ -0,0 +1 @@ +A LLM based on self-attention, but without position embeddings would have great difficulties in understanding the positions of the text inputs to each other. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_169.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..92f7fb3d7f34e3d49a635dbb793f3a9118b9f678 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_169.txt @@ -0,0 +1 @@ +This is because the probability score computed by \( \mathbf{QK}^T \) relates each word token to each other word token in \( O(1) \) computations regardless of their relative positional distance to each other. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_17.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c62b7963f9edcad4eb21b6276bfa6cbd33841a8 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_17.txt @@ -0,0 +1 @@ +At the time of writing this guide, LLMs consist of at least a couple billion parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_170.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..42b1615c4bc4554d4dc406b989b1702a8c0a22c0 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_170.txt @@ -0,0 +1 @@ +Therefore, for the LLM without position embeddings each token appears to have the same distance to all other tokens, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_171.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..c904a1b2ca85ada87b847cd0bf8eed7146280f52 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_171.txt @@ -0,0 +1 @@ +differentiating between "Hello I love you" and "You love I hello" would be very challenging. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_172.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..3710d7cb225a2a5c7d4787329e425c9c04a844cb --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_172.txt @@ -0,0 +1 @@ +For the LLM to understand sentence order, an additional cue is needed and is usually applied in the form of positional encodings (or also called positional embeddings). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_173.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f89a194319b3f7118f451bdca75a2d295f4e020 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_173.txt @@ -0,0 +1 @@ +Positional encodings encode the position of each token into a numerical representation that the LLM can leverage to better understand sentence order. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_174.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..b37ca0a9d65a5ef94bcd09976d6f57f8094bf586 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_174.txt @@ -0,0 +1 @@ +The authors of the Attention Is All You Need paper introduced sinusoidal positional embeddings \( \mathbf{P} = \mathbf{p}_1, \ldots, \mathbf{p}_N \) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_175.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1c89ff7871280bce829fcdfb25f46002d3da510 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_175.txt @@ -0,0 +1 @@ +where each vector \( \mathbf{p}_i \) is computed as a sinusoidal function of its position \( i \) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_176.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7515f45ad5f0212ed8d5e442203020ede94b5e8 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_176.txt @@ -0,0 +1 @@ +The positional encodings are then simply added to the input sequence vectors \( \mathbf{\hat{X}} = \mathbf{\hat{x}}_1, \ldots, \mathbf{\hat{x}}_N \) = \( \mathbf{x}_1 + \mathbf{p}_1, \ldots, \mathbf{x}_N + \mathbf{p}_N \), thereby cueing the model to better learn sentence order. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_177.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec92e3d26d4b4d5b8035daf29c672486b3f2bf3f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_177.txt @@ -0,0 +1 @@ +Instead of using fixed position embeddings, others (such as Devlin et al.) \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_178.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a568337a58d91d973be0295385e5a8c84209930 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_178.txt @@ -0,0 +1,2 @@ +used learned positional encodings for which the positional embeddings +\( \mathbf{P} \) are learned during training.
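As a rough sketch of the sinusoidal scheme just described (toy sizes, following the common sin/cos formulation from the Attention Is All You Need paper; not the tutorial's own code):

import torch

N, d = 6, 16                                   # toy sequence length and hidden size
pos = torch.arange(N, dtype=torch.float32).unsqueeze(1)   # positions i = 0 .. N-1
div = torch.exp(torch.arange(0, d, 2, dtype=torch.float32) * (-torch.log(torch.tensor(10000.0)) / d))

P = torch.zeros(N, d)
P[:, 0::2] = torch.sin(pos * div)              # even dimensions
P[:, 1::2] = torch.cos(pos * div)              # odd dimensions

X = torch.randn(N, d)                          # stand-in input embeddings x_1 ... x_N
X_hat = X + P                                  # \hat{x}_i = x_i + p_i, cueing the model on order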
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_179.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..de224516fe1ad811e288cb97f5b5a1694cff9854 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_179.txt @@ -0,0 +1,3 @@ +Sinusoidal and learned position embeddings used to be the predominant methods to encode sentence order into LLMs, but a couple of problems related to these positional encodings were found: + +Sinusoidal and learned position embeddings are both absolute positional embeddings, i.e. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_18.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a9aec89f31fd2357f83a61b893c49115c4e494c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_18.txt @@ -0,0 +1 @@ +Each parameter thereby is made of a decimal number, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_180.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..2231492487adc8cdf047f6f6b9f68a06b4d4dc12 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_180.txt @@ -0,0 +1 @@ +encoding a unique embedding for each position id: \( 0, \ldots, N \) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_181.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5de0e3d9b2bc550596c48aedd5b172fb359fa30 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_181.txt @@ -0,0 +1 @@ +As shown by Huang et al. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_182.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..c972bca8da386c26377e425e2412564b14ad2e2b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_182.txt @@ -0,0 +1 @@ +and Su et al., absolute positional embeddings lead to poor LLM performance for long text inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_183.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e48b2fc4f03239e7e42733ec4a6814ba0516035 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_183.txt @@ -0,0 +1 @@ +For long text inputs, it is advantageous if the model learns the relative positional distance input tokens have to each other instead of their absolute position. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_184.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..e3925efc750e1bcea38c391e75581fa8bb7650df --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_184.txt @@ -0,0 +1 @@ +When using learned position embeddings, the LLM has to be trained on a fixed input length \( N \), which makes it difficult to extrapolate to an input length longer than what it was trained on. 
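A one-line illustration of that extrapolation limitation, assuming a hypothetical learned embedding table of size N = 2048:

import torch

learned_pos = torch.nn.Embedding(2048, 16)     # trained for positions 0 ... 2047
ok = learned_pos(torch.tensor([0, 1, 2047]))   # fine: positions seen during training
# learned_pos(torch.tensor([4095]))            # IndexError: the table simply has no row
                                               # for positions beyond the training length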
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_185.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac1bcadbd2c1cce69a45b8b31f33613eaa91f586 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_185.txt @@ -0,0 +1,6 @@ +Recently, relative positional embeddings that can tackle the above-mentioned problems have become more popular, most notably: + +Rotary Position Embedding (RoPE) +ALiBi + +Both RoPE and ALiBi argue that it's best to cue the LLM about sentence order directly in the self-attention algorithm as it's there that word tokens are put into relation with each other. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_186.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..c154b5f982763c6e6530dd2ec4d9ca48e286270f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_186.txt @@ -0,0 +1 @@ +More specifically, sentence order should be cued by modifying the \( \mathbf{QK}^T \) computation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_187.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..0633d979184c3ddf90212afb6ba020fb575cfab8 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_187.txt @@ -0,0 +1 @@ +Without going into too many details, RoPE notes that positional information can be encoded into query-key pairs, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_188.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..6abab8a94df6955e6665a264d4a302e854ad182b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_188.txt @@ -0,0 +1,2 @@ +\( \mathbf{q}_i \) and \( \mathbf{x}_j \) by rotating each vector by an angle \( \theta * i \) and \( \theta * j \) respectively, with \( i, j \) describing each vector's sentence position: +$$ \mathbf{\hat{q}}_i^T \mathbf{\hat{x}}_j = \mathbf{q}_i^T \mathbf{R}_{\theta, i - j} \mathbf{x}_j. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_189.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..81700878221bacbf300acd8960f386680c73e303 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_189.txt @@ -0,0 +1,2 @@ +$$ +\( \mathbf{R}_{\theta, i - j} \) thereby represents a rotational matrix. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_19.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..06d66c0edb69c78551dd7b5ca54a006102a566dd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_19.txt @@ -0,0 +1 @@ +4.5689 which is usually stored in either float32, bfloat16, or float16 format.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_190.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..418f693216405d7d4721773e4e13e66e61038f5d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_190.txt @@ -0,0 +1 @@ +\( \theta \) is not learned during training, but instead set to a pre-defined value that depends on the maximum input sequence length during training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_191.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..e86a9e82bd53417e817cbea1c1a07254e524fa74 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_191.txt @@ -0,0 +1 @@ +By doing so, the probability score between \( \mathbf{q}_i \) and \( \mathbf{q}_j \) is only affected if \( i \ne j \) and solely depends on the relative distance \( i - j \) regardless of each vector's specific positions \( i \) and \( j \) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_192.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f3bf51d70addb4833fe04a2441f52107c679c4b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_192.txt @@ -0,0 +1,7 @@ +RoPE is used in several of today's most important LLMs, such as: + +Falcon +Llama +PaLM + +As an alternative, ALiBi proposes a much simpler relative position encoding scheme. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_193.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc739447122b66ee0cf74e4a69002d235594c83e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_193.txt @@ -0,0 +1 @@ +The relative distance that input tokens have to each other is added as a negative integer scaled by a pre-defined value m to each query-key entry of the \( \mathbf{QK}^T \) matrix right before the softmax computation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_194.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..00d3aaf79f5a12075dd5f9ced3671aa2c228b179 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_194.txt @@ -0,0 +1 @@ +As shown in the ALiBi paper, this simple relative positional encoding allows the model to retain high performance even at very long text input sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_195.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e7a8a939590e9645935d74ce34047018a7ea429 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_195.txt @@ -0,0 +1,6 @@ +ALiBi is used in several of today's most important LLMs, such as: + +MPT +BLOOM + +Both RoPE and ALiBi position encodings can extrapolate to input lengths not seen during training, though extrapolation has been shown to work much better out-of-the-box for ALiBi than for RoPE.
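The following toy 2-D sketch (an illustration, not the tutorial's code) shows the core RoPE property described above: after rotating queries and keys by \( \theta * i \) and \( \theta * j \), their dot product depends only on the relative distance \( i - j \).

import math
import torch

def rotate(v, angle):
    c, s = math.cos(angle), math.sin(angle)
    return torch.tensor([[c, -s], [s, c]]) @ v    # 2-D rotation matrix R(angle)

theta = 0.3                                       # pre-defined, not learned
q = torch.tensor([1.0, 2.0])
k = torch.tensor([0.5, -1.0])

for i, j in [(3, 1), (7, 5), (12, 10)]:           # all pairs share i - j = 2
    score = rotate(q, theta * i) @ rotate(k, theta * j)
    print(i, j, round(score.item(), 6))           # identical up to float error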
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_196.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..267c61744ac4f6e95dbd032fe15f559744c28aa4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_196.txt @@ -0,0 +1 @@ +For ALiBi, one simply increases the values of the lower triangular position matrix to match the length of the input sequence. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_197.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..d386646e8331a1529686bd0df07b3c3975a6cd1b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_197.txt @@ -0,0 +1 @@ +For RoPE, keeping the same \( \theta \) that was used during training leads to poor results when passing text inputs much longer than those seen during training, c.f Press et al.. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_198.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..1046a71534a784fd10640912e6c4d9992bd7cdc4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_198.txt @@ -0,0 +1 @@ +However, the community has found a couple of effective tricks that adapt \( \theta \), thereby allowing RoPE position embeddings to work well for extrapolated text input sequences (see here). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_199.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ee5ddecc11648b9280efa55016bcabfd62d4e87 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_199.txt @@ -0,0 +1,4 @@ +Both RoPE and ALiBi are relative positional embeddings that are not learned during training, but instead are based on the following intuitions: + - Positional cues about the text inputs should be given directly to the \( QK^T \) matrix of the self-attention layer + - The LLM should be incentivized to learn a constant relative distance positional encodings have to each other + - The further text input tokens are from each other, the lower the probability of their query-value probability. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_2.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b91dd3f33a0ee9368592de5bb01717fa1396b92e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_2.txt @@ -0,0 +1 @@ +al). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_20.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c1289eb711aa59c83d8add29502e96dc4c6cfba --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_20.txt @@ -0,0 +1,5 @@ +This allows us to easily compute the memory requirement to load the LLM into memory: + +Loading the weights of a model having X billion parameters requires roughly 4 * X GB of VRAM in float32 precision + +Nowadays, models are however rarely trained in full float32 precision, but usually in bfloat16 precision or less frequently in float16 precision. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_200.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..ccbe79ff270c1f814383c4bf1348566890dc5fc4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_200.txt @@ -0,0 +1 @@ +Both RoPE and ALiBi lower the query-key probability of tokens far away from each other. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_201.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ac7c46725c2d6b86a77e8139a20aae1fb77efa2 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_201.txt @@ -0,0 +1 @@ +RoPE by decreasing their vector product by increasing the angle between the query-key vectors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_202.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..277a2f31a7be5eee1bbd4a226f181ca64fa86ed1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_202.txt @@ -0,0 +1,3 @@ +ALiBi by adding large negative numbers to the vector product + +In conclusion, LLMs that are intended to be deployed in tasks that require handling large text inputs are better trained with relative positional embeddings, such as RoPE and ALiBi. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_203.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bba4e5d7dcf9e01e04bb3fd083e016f2336e283 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_203.txt @@ -0,0 +1 @@ +Also note that even if an LLM with RoPE and ALiBi has been trained only on a fixed length of say \( N_1 = 2048 \) it can still be used in practice with text inputs much larger than \( N_1 \), like \( N_2 = 8192 > N_1 \) by extrapolating the positional embeddings. 
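A correspondingly small sketch of the ALiBi mechanism summarized above: a linear penalty proportional to how far a key lies in the past is added to QK^T before the softmax. The slope m is a pre-defined per-head constant; the values here are arbitrary toy choices.

import torch

torch.manual_seed(0)
seq_len, head_dim, m = 5, 8, 0.5
Q, K = torch.randn(seq_len, head_dim), torch.randn(seq_len, head_dim)

i = torch.arange(seq_len).unsqueeze(1)            # query positions
j = torch.arange(seq_len).unsqueeze(0)            # key positions
alibi_bias = -m * (i - j).clamp(min=0)            # 0 on the diagonal, increasingly
                                                  # negative for keys further in the past
causal = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
scores = Q @ K.T / head_dim**0.5 + alibi_bias
probs = torch.softmax(scores.masked_fill(~causal, float("-inf")), dim=-1)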
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_204.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2a1eb79277e5012544fb6c1b49440ac4327a9a2 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_204.txt @@ -0,0 +1,2 @@ +3.2 The key-value cache +Auto-regressive text generation with LLMs works by iteratively putting in an input sequence, sampling the next token, appending the next token to the input sequence, and continuing to do so until the LLM produces a token that signifies that the generation has finished. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_205.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8fcac53602d61832013ce3b67443e065202896 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_205.txt @@ -0,0 +1 @@ +Please have a look at Transformer's Generate Text Tutorial to get a more visual explanation of how auto-regressive generation works. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_206.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..37c61cf54ef32e3febfc1e46e5069c5ae8c7c179 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_206.txt @@ -0,0 +1 @@ +Let's run a quick code snippet to show how auto-regressive works in practice. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_207.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1108e6b683de6a4bc6b79ee6cd2b388dd7c9e6e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_207.txt @@ -0,0 +1 @@ +We will simply take the most likely next token via torch.argmax. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_208.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..28465a6e3a918c6319a9140525ad18b132625488 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_208.txt @@ -0,0 +1,18 @@ +thon +input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +for _ in range(5): + next_logits = model(input_ids)["logits"][:, -1:] + next_token_id = torch.argmax(next_logits,dim=-1) +input_ids = torch.cat([input_ids, next_token_id], dim=-1) + print("shape of input_ids", input_ids.shape) +generated_text = tokenizer.batch_decode(input_ids[:, -5:]) +generated_text + +Output: +shape of input_ids torch.Size([1, 21]) +shape of input_ids torch.Size([1, 22]) +shape of input_ids torch.Size([1, 23]) +shape of input_ids torch.Size([1, 24]) +shape of input_ids torch.Size([1, 25]) +[' Here is a Python function'] +As we can see every time we increase the text input tokens by the just sampled token. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_209.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..608cf94bad607a96cb38861e7c453a56becf6107 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_209.txt @@ -0,0 +1 @@ +With very few exceptions, LLMs are trained using the causal language modeling objective and therefore mask the upper triangle of the attention score matrix - this is why in the two diagrams above the attention scores are left blank (a.k.a. have 0 probability). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_21.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..861f88616ea8adfc7138bf0a004a019767a18d9c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_21.txt @@ -0,0 +1,5 @@ +Therefore the rule of thumb becomes: + +Loading the weights of a model having X billion parameters requires roughly 2 * X GB of VRAM in bfloat16/float16 precision + +For shorter text inputs (less than 1024 tokens), the memory requirement for inference is very much dominated by the memory requirement to load the weights. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_210.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..66ebf58286bb953dd6ceee2a8d00c40b98049f3d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_210.txt @@ -0,0 +1 @@ +For a quick recap on causal language modeling, you can refer to the Illustrated Self Attention blog. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_211.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a3d79b9416231eca822145c4d1ed03b438d1bb9 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_211.txt @@ -0,0 +1 @@ +As a consequence, tokens never depend on future tokens; more specifically, the \( \mathbf{q}_i \) vector is never put in relation with any key-value vectors \( \mathbf{k}_j, \mathbf{v}_j \) if \( j > i \). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_212.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f6aacbd84e61cd183e3422a597c0f5e9fe6a41c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_212.txt @@ -0,0 +1 @@ +Instead, \( \mathbf{q}_i \) only attends to previous key-value vectors \( \mathbf{k}_{m < i}, \mathbf{v}_{m < i} \text{, for } m \in \{0, \ldots, i - 1\} \). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_213.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..35279c58e7b2d1eea4f22417eca8ff6354679efb --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_213.txt @@ -0,0 +1 @@ +In order to reduce unnecessary computation, one can therefore cache each layer's key-value vectors for all previous timesteps.
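A minimal sketch of that causal masking (toy random scores; only the mask shape matters):

import torch

seq_len = 4
scores = torch.randn(seq_len, seq_len)            # stand-in for QK^T
upper = torch.triu(torch.ones(seq_len, seq_len, dtype=torch.bool), diagonal=1)
probs = torch.softmax(scores.masked_fill(upper, float("-inf")), dim=-1)
print(probs)                                      # row i has zero probability for all j > i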
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_214.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..66245a944e6450415e50a20e5f3a6681b2ea7360 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_214.txt @@ -0,0 +1 @@ +In the following, we will tell the LLM to make use of the key-value cache by retrieving and forwarding it for each forward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_215.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c89760069a5f747bebe338d45d2e85cde8cdba1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_215.txt @@ -0,0 +1 @@ +In Transformers, we can retrieve the key-value cache by passing the use_cache flag to the forward call and can then pass it with the current token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_216.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d49e991528f1e0b0087ef0f42b83849b6d61112 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_216.txt @@ -0,0 +1,27 @@ +thon +past_key_values = None # past_key_values is the key-value cache +generated_tokens = [] +next_token_id = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda") +for _ in range(5): + next_logits, past_key_values = model(next_token_id, past_key_values=past_key_values, use_cache=True).to_tuple() + next_logits = next_logits[:, -1:] + next_token_id = torch.argmax(next_logits, dim=-1) +print("shape of input_ids", next_token_id.shape) + print("length of key-value cache", len(past_key_values[0][0])) # past_key_values are of shape [num_layers, 0 for k, 1 for v, batch_size, length, hidden_dim] + generated_tokens.append(next_token_id.item()) +generated_text = tokenizer.batch_decode(generated_tokens) +generated_text + +Output: +shape of input_ids torch.Size([1, 1]) +length of key-value cache 20 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 21 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 22 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 23 +shape of input_ids torch.Size([1, 1]) +length of key-value cache 24 +[' Here', ' is', ' a', ' Python', ' function'] +As one can see, when using the key-value cache the text input tokens are not increased in length, but remain a single input vector. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_217.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..0006a26bec06b67c1591af7d2da76885027349ab --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_217.txt @@ -0,0 +1 @@ +The length of the key-value cache on the other hand is increased by one at every decoding step. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_218.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc0d802a320c21d5aedceb9fb97f9698f696bfeb --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_218.txt @@ -0,0 +1 @@ +Making use of the key-value cache means that the \( \mathbf{QK}^T \) is essentially reduced to \( \mathbf{q}_c\mathbf{K}^T \) with \( \mathbf{q}_c \) being the query projection of the currently passed input token which is always just a single vector. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_219.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_219.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d5441ce8b16aac7966b380557195197e2dab25d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_219.txt @@ -0,0 +1,2 @@ +Using the key-value cache has two advantages: +- Significant increase in computational efficiency as less computations are performed compared to computing the full \( \mathbf{QK}^T \) matrix. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_22.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..23ecbbea2cae22b061937f3960eeda82cdeece56 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_22.txt @@ -0,0 +1 @@ +Therefore, for now, let's assume that the memory requirement for inference is equal to the memory requirement to load the model into the GPU VRAM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_220.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_220.txt new file mode 100644 index 0000000000000000000000000000000000000000..4909770c637abb667137faaededa8bd8314d7051 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_220.txt @@ -0,0 +1,2 @@ +This leads to an increase in inference speed +- The maximum required memory is not increased quadratically with the number of generated tokens, but only increases linearly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_221.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_221.txt new file mode 100644 index 0000000000000000000000000000000000000000..267e2fda0c9cdc8b054d2ccbf6df49865abf1c5c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_221.txt @@ -0,0 +1 @@ +One should always make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_222.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_222.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c0f4da467cba6c29bb4302e5583f5622acd4cea --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_222.txt @@ -0,0 +1 @@ +Transformers has the key-value cache enabled by default when making use of the text pipeline or the generate method. 
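To see the shape argument from above (\( \mathbf{q}_c\mathbf{K}^T \) with a single query row), here is a toy sketch assuming an arbitrary head dimension and an already-populated cache:

import torch

head_dim, cached_len = 8, 24
q_c = torch.randn(1, head_dim)                 # query projection of the current token only
K_cache = torch.randn(cached_len, head_dim)    # cached keys of all previous tokens
V_cache = torch.randn(cached_len, head_dim)    # cached values of all previous tokens

attn = torch.softmax(q_c @ K_cache.T / head_dim**0.5, dim=-1)   # shape (1, cached_len)
out = attn @ V_cache                                            # shape (1, head_dim)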
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_223.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_223.txt new file mode 100644 index 0000000000000000000000000000000000000000..12656a3f559ab45c4ca14b728493f91d6b73a152 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_223.txt @@ -0,0 +1 @@ +Note that, despite our advice to use key-value caches, your LLM output may be slightly different when you use them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_224.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_224.txt new file mode 100644 index 0000000000000000000000000000000000000000..1963f6ae114698c64bb44ca641fa0550c875f4f1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_224.txt @@ -0,0 +1 @@ +This is a property of the matrix multiplication kernels themselves -- you can read more about it here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_225.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_225.txt new file mode 100644 index 0000000000000000000000000000000000000000..92ba9eaa01a152489781b1ed0bdd0fb9660e6531 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_225.txt @@ -0,0 +1,2 @@ +3.2.1 Multi-round conversation +The key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_226.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_226.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd8c2e67fb79273190177f3c472e857ac1dbfe70 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_226.txt @@ -0,0 +1 @@ +Let's look at an example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_227.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_227.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ab60f89371d349d37a9bcc43e18ee92237a4b60 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_227.txt @@ -0,0 +1 @@ +User: How many people live in France? \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_228.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_228.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4b6aaf684df5ff49d2bfe8c4acebf86c53498c1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_228.txt @@ -0,0 +1,2 @@ +Assistant: Roughly 75 million people live in France +User: And how many are in Germany? \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_229.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_229.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd664c2d626f8337c8fbcda52e3e3aa0c19fca3d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_229.txt @@ -0,0 +1 @@ +Assistant: Germany has ca. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_23.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cc975716070c6f352c23c229ca6c6c994f42ebf --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_23.txt @@ -0,0 +1,10 @@ +To give some examples of how much VRAM it roughly takes to load a model in bfloat16: + +GPT3 requires 2 * 175 GB = 350 GB VRAM +Bloom requires 2 * 176 GB = 352 GB VRAM +Llama-2-70b requires 2 * 70 GB = 140 GB VRAM +Falcon-40b requires 2 * 40 GB = 80 GB VRAM +MPT-30b requires 2 * 30 GB = 60 GB VRAM +bigcode/starcoder requires 2 * 15.5 = 31 GB VRAM + +As of writing this document, the largest GPU chip on the market is the A100 & H100 offering 80GB of VRAM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_230.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_230.txt new file mode 100644 index 0000000000000000000000000000000000000000..fade0102d75c4ae110bee71c9b742047e377c00f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_230.txt @@ -0,0 +1,3 @@ +81 million inhabitants +In this chat, the LLM runs auto-regressive decoding twice: + 1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_231.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_231.txt new file mode 100644 index 0000000000000000000000000000000000000000..d69c345b1a1ac06cb611d79336a5df429b547ba0 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_231.txt @@ -0,0 +1 @@ +The first time, the key-value cache is empty and the input prompt is "User: How many people live in France?" \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_232.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_232.txt new file mode 100644 index 0000000000000000000000000000000000000000..438dd8d431de2dd8a172888c2633e45558057211 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_232.txt @@ -0,0 +1 @@ +and the model auto-regressively generates the text "Roughly 75 million people live in France" while increasing the key-value cache at every decoding step. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_233.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_233.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_233.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_234.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_234.txt new file mode 100644 index 0000000000000000000000000000000000000000..65b967f38653679dc76e8f91c81a0ddaa212c465 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_234.txt @@ -0,0 +1 @@ +The second time the input prompt is "User: How many people live in France? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_235.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_235.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9131c65faff2d226989b2827f911a229deda8a6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_235.txt @@ -0,0 +1 @@ +\n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?". \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_236.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_236.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7b6ecb26bf4e8fe207ac3da50c8554350e0b70b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_236.txt @@ -0,0 +1 @@ +Thanks to the cache, all key-value vectors for the first two sentences are already computed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_237.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_237.txt new file mode 100644 index 0000000000000000000000000000000000000000..84184044ecbcf6543b87577240eebc24099be244 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_237.txt @@ -0,0 +1 @@ +Therefore the input prompt only consists of "User: And how many in Germany?". \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_238.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_238.txt new file mode 100644 index 0000000000000000000000000000000000000000..7486f49b52fa6ea9342fac877b123c63e3b10cf7 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_238.txt @@ -0,0 +1 @@ +While processing the shortened input prompt, it's computed key-value vectors are concatenated to the key-value cache of the first decoding. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_239.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_239.txt new file mode 100644 index 0000000000000000000000000000000000000000..38d28ce0fb24708edfba26e7d8edcb72a0240c3b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_239.txt @@ -0,0 +1 @@ +The second Assistant's answer "Germany has ca. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_24.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c2667b6e6d0f960ac3b59d902cab1ee642c8e7b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_24.txt @@ -0,0 +1 @@ +Most of the models listed before require more than 80GB just to be loaded and therefore necessarily require tensor parallelism and/or pipeline parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_240.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_240.txt new file mode 100644 index 0000000000000000000000000000000000000000..a05d9b0998c391449ebc98c45aab6326c263b386 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_240.txt @@ -0,0 +1 @@ +81 million inhabitants" is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of "User: How many people live in France? 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_241.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_241.txt new file mode 100644 index 0000000000000000000000000000000000000000..0a204873b1ae96781f970fe2fe8b96aa0836ead7 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_241.txt @@ -0,0 +1 @@ +\n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?". \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_242.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_242.txt new file mode 100644 index 0000000000000000000000000000000000000000..712b36593f4554fa192fea1199794abbff99d7f3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_242.txt @@ -0,0 +1,2 @@ +Two things should be noted here: + 1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_243.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_243.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c89e0e6bfe3625e3450fa124b584e7874eb1aec --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_243.txt @@ -0,0 +1 @@ +Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_244.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_244.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c6c42b6981712ccfb35860e2422a687ff0e1372 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_244.txt @@ -0,0 +1 @@ +E.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_245.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_245.txt new file mode 100644 index 0000000000000000000000000000000000000000..25404fc410b8ac8a6de661bb85eb591bf3a44c53 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_245.txt @@ -0,0 +1 @@ +for the example above the LLM needs to understand that the user refers to the population when asking "And how many are in Germany". \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_246.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_246.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_246.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_247.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_247.txt new file mode 100644 index 0000000000000000000000000000000000000000..461479c7fc57b18110f8c7b44de4e4c76a955eac --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_247.txt @@ -0,0 +1 @@ +The key-value cache is extremely useful for chat as it allows us to continuously grow the encoded chat history instead of having to re-encode the chat history again from scratch (as e.g. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_248.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_248.txt new file mode 100644 index 0000000000000000000000000000000000000000..b63c3d4626380267bbb62c5433b85f521c7fdcbe --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_248.txt @@ -0,0 +1 @@ +would be the case when using an encoder-decoder architecture). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_249.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_249.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a6ad0542e8acce3480e5fe1853a6b4e378aede4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_249.txt @@ -0,0 +1 @@ +In transformers, a generate call will return past_key_values when return_dict_in_generate=True is passed, in addition to the default use_cache=True. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_25.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0b7406d079cf99afd62539adcd48e611a40121d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_25.txt @@ -0,0 +1 @@ +🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_250.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_250.txt new file mode 100644 index 0000000000000000000000000000000000000000..791fd32feda2dd24a60757f7bcc0ebc04f16d232 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_250.txt @@ -0,0 +1 @@ +Note that it is not yet available through the pipeline interface. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_251.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_251.txt new file mode 100644 index 0000000000000000000000000000000000000000..a26f24633a39942017647be376a9eaf3bccf37b3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_251.txt @@ -0,0 +1,20 @@ +thon +Generation as usual +prompt = system_prompt + "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here" +model_inputs = tokenizer(prompt, return_tensors='pt') +generation_output = model.generate(**model_inputs, max_new_tokens=60, return_dict_in_generate=True) +decoded_output = tokenizer.batch_decode(generation_output.sequences)[0] +Piping the returned past_key_values to speed up the next conversation round +prompt = decoded_output + "\nQuestion: How can I modify the function above to return Mega bytes instead?\n\nAnswer: Here" +model_inputs = tokenizer(prompt, return_tensors='pt') +generation_output = model.generate( + **model_inputs, + past_key_values=generation_output.past_key_values, + max_new_tokens=60, + return_dict_in_generate=True +) +tokenizer.batch_decode(generation_output.sequences)[0][len(prompt):] + +Output: + + is a modified version of the function that returns Mega bytes instead. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_252.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_252.txt new file mode 100644 index 0000000000000000000000000000000000000000..41e01667498fee602211ef9c05ac5c48b91a43d6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_252.txt @@ -0,0 +1,5 @@ +def bytes_to_megabytes(bytes): + return bytes / 1024 / 1024 +Answer: The function takes a number of bytes as input and returns the number of + +Great, no additional time is spent recomputing the same key and values for the attention layer! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_253.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_253.txt new file mode 100644 index 0000000000000000000000000000000000000000..09b83c30c80f299e1a2287c4d13d94726512a38f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_253.txt @@ -0,0 +1 @@ +There is however one catch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_254.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_254.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b52d451a0db23331837894f8fc4808cf1c8e9ff --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_254.txt @@ -0,0 +1 @@ +While the required peak memory for the \( \mathbf{QK}^T \) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_255.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_255.txt new file mode 100644 index 0000000000000000000000000000000000000000..4577c35db62ced42f8e75056fd351ba380ddd404 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_255.txt @@ -0,0 +1 @@ +Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \( \mathbf{x}_i \text{, for } i \in {1, \ldots, c - 1} \) for all self-attention layers and for all attention heads. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_256.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_256.txt new file mode 100644 index 0000000000000000000000000000000000000000..6780d54882d67ae87e44dfd549fe9f2931b6451a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_256.txt @@ -0,0 +1 @@ +Let's compute the number of float values that need to be stored in the key-value cache for the LLM bigcode/octocoder that we used before. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_257.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_257.txt new file mode 100644 index 0000000000000000000000000000000000000000..de9ead2d291f6966f6ed0d769d6b97e7e6d44335 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_257.txt @@ -0,0 +1 @@ +The number of float values amounts to two times the sequence length times the number of attention heads times the attention head dimension and times the number of layers. 
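The formula in words above can be written as a small helper; the example numbers below (40 layers, 48 heads of dimension 128, a 16k-token sequence) are assumptions chosen to roughly match the model discussed here and reproduce the figure computed next.

def kv_cache_num_floats(seq_len, num_layers, num_heads, head_dim):
    # 2 (keys and values) * sequence length * layers * heads * head dimension
    return 2 * seq_len * num_layers * num_heads * head_dim

print(kv_cache_num_floats(16_000, 40, 48, 128))   # 7864320000, i.e. roughly 8 billion floats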
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_258.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_258.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c9f6240eaedf277aadfa69e0b535ed1227f7a32 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_258.txt @@ -0,0 +1,7 @@ +Computing this for our LLM at a hypothetical input sequence length of 16000 gives: +python +config = model.config +2 * 16_000 * config.n_layer * config.n_head * config.n_embd // config.n_head +Output: +7864320000 +Roughly 8 billion float values! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_259.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_259.txt new file mode 100644 index 0000000000000000000000000000000000000000..ad923555bcdc36adc902d8d69cc071820fa879d5 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_259.txt @@ -0,0 +1 @@ +Storing 8 billion float values in float16 precision requires around 15 GB of RAM which is circa half as much as the model weights themselves! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_26.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..256f0319ea92716924c02778c0e6ee94be222ebb --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_26.txt @@ -0,0 +1 @@ +If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at the text-generation-inference library. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_260.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_260.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6e62710b1ef99c785bf2b06475642a129cc370a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_260.txt @@ -0,0 +1 @@ +Researchers have proposed two methods that allow to significantly reduce the memory cost of storing the key-value cache, which are explored in the next subsections. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_261.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_261.txt new file mode 100644 index 0000000000000000000000000000000000000000..c08a28a4528aa9a3049384218e44833103f9861e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_261.txt @@ -0,0 +1,2 @@ +3.2.2 Multi-Query-Attention (MQA) +Multi-Query-Attention was proposed in Noam Shazeer's Fast Transformer Decoding: One Write-Head is All You Need paper. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_262.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_262.txt new file mode 100644 index 0000000000000000000000000000000000000000..e82a556c7f9ecdb17b72de5f9bab2eb9c8fd717b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_262.txt @@ -0,0 +1 @@ +As the title says, Noam found out that instead of using n_head key-value projections weights, one can use a single head-value projection weight pair that is shared across all attention heads without that the model's performance significantly degrades. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_263.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_263.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea57ade8b93978444d6e8c827b7f23663b4c986e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_263.txt @@ -0,0 +1 @@ +By using a single head-value projection weight pair, the key value vectors \( \mathbf{k}_i, \mathbf{v}_i \) have to be identical across all attention heads which in turn means that we only need to store 1 key-value projection pair in the cache instead of n_head ones. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_264.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_264.txt new file mode 100644 index 0000000000000000000000000000000000000000..3604e1391b318e694f3a85c25c565cfc5d3328ad --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_264.txt @@ -0,0 +1 @@ +As most LLMs use between 20 and 100 attention heads, MQA significantly reduces the memory consumption of the key-value cache. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_265.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_265.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ae2be67c8f54b1495037959d4c4b30b4bba0d1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_265.txt @@ -0,0 +1 @@ +For the LLM used in this notebook we could therefore reduce the required memory consumption from 15 GB to less than 400 MB at an input sequence length of 16000. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_266.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_266.txt new file mode 100644 index 0000000000000000000000000000000000000000..32166f76ceb4d86f7f8e7e6eb329a696f01beec3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_266.txt @@ -0,0 +1 @@ +In addition to memory savings, MQA also leads to improved computational efficiency as explained in the following. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_267.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_267.txt new file mode 100644 index 0000000000000000000000000000000000000000..014b349364a8b5bd88df667d880ca26f85d50fd1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_267.txt @@ -0,0 +1 @@ +In auto-regressive decoding, large key-value vectors need to be reloaded, concatenated with the current key-value vector pair to be then fed into the \( \mathbf{q}_c\mathbf{K}^T \) computation at every step. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_268.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_268.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca17e261b9a78b722ff5db86a21b954e30f36e37 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_268.txt @@ -0,0 +1 @@ +For auto-regressive decoding, the required memory bandwidth for the constant reloading can become a serious time bottleneck. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_269.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_269.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b644418424aed53d2a183240748354276e7014d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_269.txt @@ -0,0 +1 @@ +By reducing the size of the key-value vectors less memory needs to be accessed, thus reducing the memory bandwidth bottleneck. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_27.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..794a357af9a662d92c66ef0d4b57ea3d8b1bd7ae --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_27.txt @@ -0,0 +1 @@ +Naive pipeline parallelism is supported out of the box. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_270.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_270.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5d3c894bea7888fcb562812ca0d675970fba6e6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_270.txt @@ -0,0 +1 @@ +For more detail, please have a look at Noam's paper. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_271.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_271.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbe7940e8f0f00d8b5bc49c896745546912f5688 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_271.txt @@ -0,0 +1 @@ +The important part to understand here is that reducing the number of key-value attention heads to 1 only makes sense if a key-value cache is used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_272.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_272.txt new file mode 100644 index 0000000000000000000000000000000000000000..12b1e76a8781d38c33ac202f4cd3cc9a1b9c5b09 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_272.txt @@ -0,0 +1 @@ +The peak memory consumption of the model for a single forward pass without key-value cache stays unchanged as every attention head still has a unique query vector so that each attention head still has a different \( \mathbf{QK}^T \) matrix. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_273.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_273.txt new file mode 100644 index 0000000000000000000000000000000000000000..6b4188f34c9a85b92d0ca4ae0698b3b68e2d2878 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_273.txt @@ -0,0 +1,8 @@ +MQA has seen wide adoption by the community and is now used by many of the most popular LLMs: + +Falcon +PaLM +MPT +BLOOM + +Also, the checkpoint used in this notebook - bigcode/octocoder - makes use of MQA. 
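As a toy illustration of the mechanics (plain PyTorch with made-up shapes, not the actual implementation of any of the models listed above), a single cached key-value head can simply be broadcast across all query heads during decoding:
python
import torch

n_head, seq_len, head_dim = 8, 4, 16
q = torch.randn(1, n_head, 1, head_dim)   # one current query vector per attention head
k = torch.randn(1, 1, seq_len, head_dim)  # shared key cache: a single head
v = torch.randn(1, 1, seq_len, head_dim)  # shared value cache: a single head

# Broadcasting expands the single key-value head across all query heads,
# so only one head's worth of cache is ever stored or reloaded.
scores = torch.softmax(q @ k.transpose(-1, -2) / head_dim**0.5, dim=-1)
out = scores @ v
print(out.shape)  # torch.Size([1, 8, 1, 16])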
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_274.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_274.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9163d7f72ee223157db7d4a4c85ffb8835c94c6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_274.txt @@ -0,0 +1,2 @@ +3.2.3 Grouped-Query-Attention (GQA) +Grouped-Query-Attention, as proposed by Ainslie et al. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_275.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_275.txt new file mode 100644 index 0000000000000000000000000000000000000000..27314004f663280778c99913f291012be0dd5f8f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_275.txt @@ -0,0 +1 @@ +from Google, found that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_276.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_276.txt new file mode 100644 index 0000000000000000000000000000000000000000..176da623b9b06783ad0119be0060db6a003b9709 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_276.txt @@ -0,0 +1 @@ +The paper argues that more model performance can be kept by reducing the number of key-value projection weights less drastically. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_277.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_277.txt new file mode 100644 index 0000000000000000000000000000000000000000..4253b0541230b17dbf11b1878c8f43ec3489b332 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_277.txt @@ -0,0 +1 @@ +Instead of using just a single key-value projection weight, n < n_head key-value projection weights should be used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_278.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_278.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5a227a7a131e74b66a06c28cc80226326530991 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_278.txt @@ -0,0 +1 @@ +By choosing n to be significantly smaller than n_head, such as 2, 4, or 8, almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus, arguably, less performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_279.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_279.txt new file mode 100644 index 0000000000000000000000000000000000000000..098789122ffde43de6867c704c88ce3d4f179fbc --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_279.txt @@ -0,0 +1 @@ +Moreover, the authors of GQA found that existing model checkpoints can be uptrained to have a GQA architecture with as little as 5% of the original pre-training compute. 
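Extending the toy sketch above to GQA (again just an illustration with made-up shapes, not Ainslie et al.'s implementation): only n_kv_head key-value heads are cached, and each cached head is shared by a group of query heads.
python
import torch

n_head, n_kv_head, seq_len, head_dim = 8, 2, 4, 16   # 4 query heads share each key-value head
q = torch.randn(1, n_head, 1, head_dim)
k = torch.randn(1, n_kv_head, seq_len, head_dim)     # only n_kv_head heads are cached
v = torch.randn(1, n_kv_head, seq_len, head_dim)

# Repeat each cached key-value head for the group of query heads it serves.
k_exp = k.repeat_interleave(n_head // n_kv_head, dim=1)
v_exp = v.repeat_interleave(n_head // n_kv_head, dim=1)

scores = torch.softmax(q @ k_exp.transpose(-1, -2) / head_dim**0.5, dim=-1)
out = scores @ v_exp
print(out.shape)  # torch.Size([1, 8, 1, 16])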
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_28.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..342e100c1dc2ab00df128f0cb3fee5736b236fdd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_28.txt @@ -0,0 +1 @@ +For this, simply load the model with device="auto" which will automatically place the different layers on the available GPUs as explained here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_280.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_280.txt new file mode 100644 index 0000000000000000000000000000000000000000..64464d38f6f2659d289e6027a6293ce21445a7d7 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_280.txt @@ -0,0 +1 @@ +While 5% of the original pre-training compute can still be a massive amount, GQA uptraining allows existing checkpoints to be useful for longer input sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_281.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_281.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c1003430cdd1bc0ae649bb865a59929f3a754cd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_281.txt @@ -0,0 +1 @@ +GQA was only recently proposed which is why there is less adoption at the time of writing this notebook. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_282.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_282.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbe78cb48af42df08124a764a625ff817c6923b2 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_282.txt @@ -0,0 +1 @@ +The most notable application of GQA is Llama-v2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_283.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_283.txt new file mode 100644 index 0000000000000000000000000000000000000000..32fd9bc17386098dda83b0c8e00710c2e42d10e2 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_283.txt @@ -0,0 +1 @@ +As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_284.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_284.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ce15e9893c3a39ec280d2e4e7ab22e09b4eba8d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_284.txt @@ -0,0 +1,2 @@ +Conclusion +The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. 
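Following up on the recommendation above to prefer MQA/GQA checkpoints for long-sequence deployment, one practical check is to inspect a checkpoint's config. The attribute names below vary by architecture and are assumptions, which is why getattr is used with a fallback:
python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bigcode/octocoder")
# GPT-BigCode-style configs expose a `multi_query` flag; Llama-style configs
# expose `num_key_value_heads` instead. Neither attribute is guaranteed to exist.
print("multi_query:", getattr(config, "multi_query", None))
print("num_key_value_heads:", getattr(config, "num_key_value_heads", None))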
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_285.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_285.txt new file mode 100644 index 0000000000000000000000000000000000000000..19c14f3a003b2e19852198cff8ef3e4c0fc60cb1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_285.txt @@ -0,0 +1 @@ +As an example, one such promising research direction is speculative decoding where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_286.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_286.txt new file mode 100644 index 0000000000000000000000000000000000000000..698b0b8a89a301321542c87c5da0e25e6f53ca13 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_286.txt @@ -0,0 +1 @@ +Going into more detail is out of the scope of this notebook, but can be read upon in this nice blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_287.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_287.txt new file mode 100644 index 0000000000000000000000000000000000000000..e056dcda231d62b426b7cc26d5a1d858153d582c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_287.txt @@ -0,0 +1 @@ +The reason massive LLMs such as GPT3/4, Llama-2-70b, Claude, PaLM can run so quickly in chat-interfaces such as Hugging Face Chat or ChatGPT is to a big part thanks to the above-mentioned improvements in precision, algorithms, and architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_288.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_288.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6bca2da77cb58efd04256f506333bb63da76d13 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_288.txt @@ -0,0 +1 @@ +Going forward, accelerators such as GPUs, TPUs, etc will only get faster and allow for more memory, but one should nevertheless always make sure to use the best available algorithms and architectures to get the most bang for your buck 🤗 \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_29.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..262edb4c764636ad6116d62b7bb5444bf288488f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_29.txt @@ -0,0 +1 @@ +Note, however that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_3.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..913ae74e9b32c821d240f05cf6d72f8bc4b3552e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_3.txt @@ -0,0 +1 @@ +This consequently amplifies the memory demands for inference. 
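Recent versions of Transformers expose the speculative-decoding idea mentioned above as "assisted generation". The sketch below uses placeholder checkpoint names and assumes the draft model shares the target model's tokenizer; treat it as an outline rather than a tested recipe:
python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoints: a large target model and a small draft model with the same tokenizer.
tokenizer = AutoTokenizer.from_pretrained("large-target-checkpoint")
target = AutoModelForCausalLM.from_pretrained("large-target-checkpoint", device_map="auto")
draft = AutoModelForCausalLM.from_pretrained("small-draft-checkpoint", device_map="auto")

inputs = tokenizer("Question: ...", return_tensors="pt").to(target.device)
output = target.generate(**inputs, assistant_model=draft, max_new_tokens=60)
print(tokenizer.decode(output[0], skip_special_tokens=True))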
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_30.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..7eece8b5092bd7c04405b28acecc1e4a616c8f4d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_30.txt @@ -0,0 +1 @@ +For this more advanced pipeline parallelism is required as explained here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_31.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..198da3bf1515a11a37195f2d968fc86e8494b074 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_31.txt @@ -0,0 +1,8 @@ +If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows + +!pip install transformers accelerate bitsandbytes optimum +thon +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", pad_token_id=0) + +By using device_map="auto" the attention layers would be equally distributed over all available GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_32.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a9365d88d6cf3934b0047fb3a67d96a1e892f4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_32.txt @@ -0,0 +1 @@ +In this guide, we will use bigcode/octocoder as it can be run on a single 40 GB A100 GPU device chip. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_33.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9339785f92b946bb756a06b9e9b505410f153fd --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_33.txt @@ -0,0 +1 @@ +Note that all memory and speed optimizations that we will apply going forward, are equally applicable to models that require model or tensor parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_34.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d1f379fd23ebfc529e40e12743fa7749cd965da --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_34.txt @@ -0,0 +1 @@ +Since the model is loaded in bfloat16 precision, using our rule of thumb above, we would expect the memory requirement to run inference with bigcode/octocoder to be around 31 GB VRAM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_35.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..16cb0deba15b84050c7085ae033edc823a3f2148 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_35.txt @@ -0,0 +1 @@ +Let's give it a try. 
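That estimate is just arithmetic: roughly 2 bytes per parameter in bfloat16, with ~15.5 billion parameters being an approximate count for bigcode/octocoder.
python
n_params = 15.5e9     # approximate parameter count of bigcode/octocoder
bytes_per_param = 2   # bfloat16 / float16
print(n_params * bytes_per_param / 1e9, "GB")  # ~31 GB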
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_36.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..b35dcd4dcfba7b2ff7a2a5060097869c4f1ddc09 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_36.txt @@ -0,0 +1 @@ +We first load the model and tokenizer and then pass both to Transformers' pipeline object. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_37.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ed81f92421d6518d5324fbf7a55730ddbd754e0 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_37.txt @@ -0,0 +1,15 @@ +thon +from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline +import torch +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0) +tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder") +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) + +thon +prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:" +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result + +Output: +Here is a Python function that transforms bytes to Giga bytes:\n\npython\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n\n\nThis function takes a single +Nice, we can now directly use the result to convert bytes into Gigabytes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_38.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2ff1c7bbc742dcc44dd0ec374bfc05c310930ed --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_38.txt @@ -0,0 +1,4 @@ +python +def bytes_to_giga_bytes(bytes): + return bytes / 1024 / 1024 / 1024 +Let's call torch.cuda.max_memory_allocated to measure the peak GPU memory allocation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_39.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c298588157d50fb58548c83624de05f51551f1e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_39.txt @@ -0,0 +1,6 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: + +29.0260648727417 +Close enough to our back-of-the-envelope computation! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_4.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..15d4e0c5cb9614bd5528da1ae84ff50c68481eaf --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_4.txt @@ -0,0 +1 @@ +In many real-world tasks, LLMs need to be given extensive contextual information. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_40.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..76b87f70226bcd9cee1e8e3860fc5b69c201cfbf --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_40.txt @@ -0,0 +1 @@ +We can see the number is not exactly correct as going from bytes to kilobytes requires a multiplication of 1024 instead of 1000. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_41.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f16633ee3b71fbde93bbc5c87abf0d669c0472a1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_41.txt @@ -0,0 +1 @@ +Therefore the back-of-the-envelope formula can also be understood as an "at most X GB" computation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_42.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b17a637669f134756b43112fae59bf117f2d7b6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_42.txt @@ -0,0 +1 @@ +Note that if we had tried to run the model in full float32 precision, a whopping 64 GB of VRAM would have been required. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_43.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..a33d3dac4c32dce50dceb92a562ca75008ce5405 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_43.txt @@ -0,0 +1 @@ +Almost all models are trained in bfloat16 nowadays, there is no reason to run the model in full float32 precision if your GPU supports bfloat16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_44.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf5981123550e3132d7ec8a0a6b60f767bcd3eea --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_44.txt @@ -0,0 +1 @@ +Float32 won't give better inference results than the precision that was used to train the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_45.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc6f3e101ae01815c6fdb3bcfca82e6ec551db3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_45.txt @@ -0,0 +1 @@ +If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under "torch_dtype", e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_46.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..da4880b7f158d72f114b1f044ce9533c64c2ab6f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_46.txt @@ -0,0 +1 @@ +here. 
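This can also be checked programmatically without downloading the weights, assuming the checkpoint's config.json records a torch_dtype entry:
python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("bigcode/octocoder")
print(config.torch_dtype)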
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_47.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..60a4d92a760ac2a5b1e6401f649027f250c870f9 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_47.txt @@ -0,0 +1 @@ +It is recommended to set the model to the same precision type as written in the config when loading with from_pretrained(, torch_dtype=) except when the original type is float32 in which case one can use both float16 or bfloat16 for inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_48.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbdaa061b49215816f8f372ad90ea9dd46ecf662 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_48.txt @@ -0,0 +1 @@ +Let's define a flush() function to free all allocated memory so that we can accurately measure the peak allocated GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_49.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..d566f1f58203f8edb9935fec02066a04e698d0a3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_49.txt @@ -0,0 +1,11 @@ +thon +del pipe +del model +import gc +import torch +def flush(): + gc.collect() + torch.cuda.empty_cache() + torch.cuda.reset_peak_memory_stats() + +Let's call it now for the next experiment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_5.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..16cfa6f073abd331d76a591ea2a0d21e925a9fa6 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_5.txt @@ -0,0 +1 @@ +This necessitates the model's capability to manage very long input sequences during inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_50.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..e1209288637cd1498592a9befea42eb1667829a3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_50.txt @@ -0,0 +1,9 @@ +python +flush() +In the recent version of the accelerate library, you can also use an utility method called release_memory() +thon +from accelerate.utils import release_memory + +release_memory(model) + +Now what if your GPU does not have 32 GB of VRAM? \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_51.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e989638ba2f9e09e992782954a7d4ec166489df --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_51.txt @@ -0,0 +1 @@ +It has been found that model weights can be quantized to 8-bit or 4-bits without a significant loss in performance (see Dettmers et al.). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_52.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3cfb2e66efd7b3db60de5de3122c619b727aca4 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_52.txt @@ -0,0 +1 @@ +Model can be quantized to even 3 or 2 bits with an acceptable loss in performance as shown in the recent GPTQ paper 🤯. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_53.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fb34cc364a62e1273d68c5ca0bfddb1ecdacf3a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_53.txt @@ -0,0 +1 @@ +Without going into too many details, quantization schemes aim at reducing the precision of weights while trying to keep the model's inference results as accurate as possible (a.k.a as close as possible to bfloat16). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_54.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7075263ca0d3e7da9876311fac862ba9b6b6e5e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_54.txt @@ -0,0 +1 @@ +Note that quantization works especially well for text generation since all we care about is choosing the set of most likely next tokens and don't really care about the exact values of the next token logit distribution. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_55.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fa9a73799a1130abd8ff8a3c4e4e2eb2c992b38 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_55.txt @@ -0,0 +1 @@ +All that matters is that the next token logit distribution stays roughly the same so that an argmax or topk operation gives the same results. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_56.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0be2acc070f1237c5b5b3edad0af3eac83fd127 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_56.txt @@ -0,0 +1,13 @@ +There are various quantization techniques, which we won't discuss in detail here, but in general, all quantization techniques work as follows: + +Quantize all weights to the target precision + +Load the quantized weights, and pass the input sequence of vectors in bfloat16 precision + +Dynamically dequantize weights to bfloat16 to perform the computation with their input vectors in bfloat16 precision + +In a nutshell, this means that inputs-weight matrix multiplications, with \( X \) being the inputs, \( W \) being a weight matrix and \( Y \) being the output: +$$ Y = X * W $$ +are changed to +$$ Y = X * \text{dequantize}(W) $$ +for every matrix multiplication. 
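A toy version of this pattern (symmetric 8-bit quantization with a single scale, purely illustrative and not what bitsandbytes does internally) looks like this:
python
import torch

W = torch.randn(128, 128)   # a "weight matrix" in full precision
X = torch.randn(1, 128)     # an input vector

scale = W.abs().max() / 127.0
W_int8 = torch.round(W / scale).to(torch.int8)   # this is what gets stored

W_dequant = W_int8.float() * scale               # dequantized on the fly at inference time
Y = X @ W_dequant                                # Y = X * dequantize(W)
print((Y - X @ W).abs().max())                   # small quantization error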
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_57.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..7572a0a9570b425a9e65c015568de8966b4e32e3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_57.txt @@ -0,0 +1 @@ +Dequantization and re-quantization is performed sequentially for all weight matrices as the inputs run through the network graph. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_58.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..76e410603491564d04c3421e2870be8f61dd25a5 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_58.txt @@ -0,0 +1 @@ +Therefore, inference time is often not reduced when using quantized weights, but rather increases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_59.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..527a952e491bc28f72c7af7a303b592e853bbe7e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_59.txt @@ -0,0 +1 @@ +Enough theory, let's give it a try! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_6.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..04ff9b356d58df690176832d7da8e7b82e23a704 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_6.txt @@ -0,0 +1 @@ +The crux of these challenges lies in augmenting the computational and memory capabilities of LLMs, especially when handling expansive input sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_60.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f2e274630d35aeae496fe03d02b5c02c23b144a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_60.txt @@ -0,0 +1,2 @@ +To quantize the weights with Transformers, you need to make sure that +the bitsandbytes library is installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_61.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..19e771b6590ca2dfb1024d889b7bbfb67b53d436 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_61.txt @@ -0,0 +1,2 @@ +!pip install bitsandbytes +We can then load models in 8-bit quantization by simply adding a load_in_8bit=True flag to from_pretrained. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_62.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee3000de0e258cfba132304b1206eba7709d542f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_62.txt @@ -0,0 +1,3 @@ +python +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_8bit=True, pad_token_id=0) +Now, let's run our example again and measure the memory usage. 
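As an aside, newer versions of Transformers favour passing an explicit quantization config over the bare flag; the following is believed to be equivalent to load_in_8bit=True:
python
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quant_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder", quantization_config=quant_config, pad_token_id=0
)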
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_63.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fa5c860b35f6496c497550a15b16980951f5c1a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_63.txt @@ -0,0 +1,8 @@ +thon +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result + +Output: +Here is a Python function that transforms bytes to Giga bytes:\n\npython\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n\n\nThis function takes a single +Nice, we're getting the same result as before, so no loss in accuracy! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_64.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..970c09ded9d34edbc24f10008d87acc8deaa600b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_64.txt @@ -0,0 +1 @@ +Let's look at how much memory was used this time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_65.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..8634a79e3a2f7e53580419830963dedc0776b8b7 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_65.txt @@ -0,0 +1,5 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: +15.219234466552734 +Significantly less! \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_66.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..802a5737de0ecf8ac3a89f5143f7a485d32c6d9a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_66.txt @@ -0,0 +1 @@ +We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_67.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..99440c9d5c00fed8d05eaf261bf0e2dd34c53836 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_67.txt @@ -0,0 +1 @@ +We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_68.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..98e028c31ab27dd1cfa100621f8c6c284cc85d97 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_68.txt @@ -0,0 +1 @@ +However, we can also notice a slight slow-down during inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_69.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..a24eb0a1eadca83e59cc61d68ac44db9d644fa91 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_69.txt @@ -0,0 +1 @@ +We delete the models and flush the memory again. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_7.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..f570f37c7a73b67fa8eae5ecaf7e2de67303ea07 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_7.txt @@ -0,0 +1,3 @@ +In this guide, we will go over the effective techniques for efficient LLM deployment: + +Lower Precision: Research has shown that operating at reduced numerical precision, namely 8-bit and 4-bit can achieve computational advantages without a considerable decline in model performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_70.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fb8c154586310da180ed707b2a415c00e314c55 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_70.txt @@ -0,0 +1,6 @@ +python +del model +del pipe +python +flush() +Let's see what peak GPU memory consumption 4-bit quantization gives. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_71.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..671d18908034b2cfa2dbe56a66aae81c63f8d9fb --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_71.txt @@ -0,0 +1 @@ +Quantizing the model to 4-bit can be done with the same API as before - this time by passing load_in_4bit=True instead of load_in_8bit=True. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_72.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..d0d11b449bb5f17f50ad8e26873913b7de504477 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_72.txt @@ -0,0 +1,9 @@ +thon +model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0) +pipe = pipeline("text-generation", model=model, tokenizer=tokenizer) +result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):] +result + +Output: +Here is a Python function that transforms bytes to Giga bytes:\n\n\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n\n\nThis function takes a single argument +We're almost seeing the same output text as before - just the python is missing just before the code snippet. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_73.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..e271458b3b67585f62ed6db9ae574ecbe24aa053 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_73.txt @@ -0,0 +1 @@ +Let's see how much memory was required. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_74.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..3921dd205a1613a2fe5f82db3092b3a2ce07c136 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_74.txt @@ -0,0 +1,5 @@ +python +bytes_to_giga_bytes(torch.cuda.max_memory_allocated()) +Output: +9.543574333190918 +Just 9.5GB! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_75.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..9330eed04633507d1a1ef1a45e50dde80dca6788 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_75.txt @@ -0,0 +1 @@ +That's really not a lot for a >15 billion parameter model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_76.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e8a9e0371c1141cc9f5573298e9e6b47bd9365a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_76.txt @@ -0,0 +1 @@ +While we see very little degradation in accuracy for our model here, 4-bit quantization can in practice often lead to different results compared to 8-bit quantization or full bfloat16 inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_77.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc7b3e1f07f3257414bf6bb3cb6166c7869b684 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_77.txt @@ -0,0 +1 @@ +It is up to the user to try it out. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_78.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbf3ef11cd91f667fa53dc43ce18e32140e02fa5 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_78.txt @@ -0,0 +1 @@ +Also note that inference here was again a bit slower compared to 8-bit quantization which is due to the more aggressive quantization method used for 4-bit quantization leading to \( \text{quantize} \) and \( \text{dequantize} \) taking longer during inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_79.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c966ea03e72973fa76e11b92c3a17613851d422d --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_79.txt @@ -0,0 +1,6 @@ +python +del model +del pipe +python +flush() +Overall, we saw that running OctoCoder in 8-bit precision reduced the required GPU VRAM from 32G GPU VRAM to only 15GB and running the model in 4-bit precision further reduces the required GPU VRAM to just a bit over 9GB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_8.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..3151a84c5be62079ef48dc0c56d801cc74d88b4f --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_8.txt @@ -0,0 +1 @@ +Flash Attention: Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_80.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..a54cfbd6d073b14bc576088b07a46f47fbaded67 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_80.txt @@ -0,0 +1 @@ +4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_81.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..14265392e2cd937c5ea2ef19a2ad46aa89f9e7e0 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_81.txt @@ -0,0 +1 @@ +For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the AutoGPTQ implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_82.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ea57f221dbf074a7a4ceab1ed60171e32c3081e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_82.txt @@ -0,0 +1 @@ +As a conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and in some cases inference time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_83.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..fac3e9ad6745a64f12d8e568288c89942abf493b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_83.txt @@ -0,0 +1 @@ +If GPU memory is not a constraint for your use case, there is often no need to look into quantization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_84.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..69cba49ff6a99a451910c0e113ed0c0f5ea2a845 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_84.txt @@ -0,0 +1 @@ +However many GPUs simply can't run LLMs without quantization methods and in this case, 4-bit and 8-bit quantization schemes are extremely useful tools. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_85.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..80360fffd7d61136cc89806ea19d54f98d1c158a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_85.txt @@ -0,0 +1 @@ +For more in-detail usage information, we strongly recommend taking a look at the Transformers Quantization Docs. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_86.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..da3edab659bfe29055aec5347d24598135337a6a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_86.txt @@ -0,0 +1 @@ +Next, let's look into how we can improve computational and memory efficiency by using better algorithms and an improved model architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_87.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_87.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_88.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..d35c99d6b8bfcba5f1be752e762972b0a12649e1 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_88.txt @@ -0,0 +1,2 @@ +Flash Attention +Today's top-performing LLMs share more or less the same fundamental architecture that consists of feed-forward layers, activation layers, layer normalization layers, and most crucially, self-attention layers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_89.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..62afb900d6307adb5077c8b16ac45eb57e3beaf3 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_89.txt @@ -0,0 +1 @@ +Self-attention layers are central to Large Language Models (LLMs) in that they enable the model to understand the contextual relationships between input tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_9.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6fcbf40bfd6fa6f0eff96eb6cf4da2cb7e5d8b7 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_9.txt @@ -0,0 +1 @@ +Architectural Innovations: Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_90.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..22d271d3b7055dd4518d306fec935c1c385ed418 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_90.txt @@ -0,0 +1 @@ +However, the peak GPU memory consumption for self-attention layers grows quadratically both in compute and memory complexity with number of input tokens (also called sequence length) that we denote in the following by \( N \) . 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_91.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0667b18d3a01dc2659220ebfe0ecbe5eacd752b --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_91.txt @@ -0,0 +1 @@ +While this is not really noticeable for shorter input sequences (of up to 1000 input tokens), it becomes a serious problem for longer input sequences (at around 16000 input tokens). \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_92.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..1142e84f3070f76d96646ba6551227dcf1bcec6c --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_92.txt @@ -0,0 +1 @@ +Let's take a closer look. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_93.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..7463537b41d474517b90cc4c6b0223f00f850f60 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_93.txt @@ -0,0 +1,3 @@ +The formula to compute the output \( \mathbf{O} \) of a self-attention layer for an input \( \mathbf{X} \) of length \( N \) is: +$$ \textbf{O} = \text{Attn}(\mathbf{X}) = \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \text{ with } \mathbf{Q} = \mathbf{W}_q \mathbf{X}, \mathbf{V} = \mathbf{W}_v \mathbf{X}, \mathbf{K} = \mathbf{W}_k \mathbf{X} $$ +\( \mathbf{X} = (\mathbf{x}_1, \dots, \mathbf{x}_N) \) is thereby the input sequence to the attention layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_94.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2a6f26bbed753eb0e3a282245a8db1d3e45c601 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_94.txt @@ -0,0 +1 @@ +The projections \( \mathbf{Q} \) and \( \mathbf{K} \) will each consist of \( N \) vectors resulting in the \( \mathbf{QK}^T \) being of size \( N^2 \) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_95.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..e99046cfe476a5ba7872fd0d2dc8cf1daacd079a --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_95.txt @@ -0,0 +1 @@ +LLMs usually have multiple attention heads, thus doing multiple self-attention computations in parallel. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_96.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2cc6dac582d700967f906570c860b8f094d4457 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_96.txt @@ -0,0 +1 @@ +Assuming the LLM has 40 attention heads and runs in bfloat16 precision, we can calculate the memory requirement to store the \( \mathbf{QK^T} \) matrices to be \( 40 * 2 * N^2 \) bytes. 
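Plugging a few sequence lengths into this formula is plain arithmetic; the small helper below uses the head count and dtype size assumed in the sentence above:
python
def qkt_memory_gib(n_tokens, n_heads=40, bytes_per_value=2):
    # Memory needed to materialize the N x N attention score matrix for every head.
    return n_heads * bytes_per_value * n_tokens**2 / 1024**3

for n in (1_000, 16_000, 100_000):
    print(f"N={n}: {qkt_memory_gib(n):,.2f} GiB")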
\ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_97.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..775affa6f43edaad9386bab8592e02f8098472e8 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_97.txt @@ -0,0 +1 @@ +For \( N=1000 \) only around 50 MB of VRAM are needed, however, for \( N=16000 \) we would need 19 GB of VRAM, and for \( N=100,000 \) we would need almost 1TB just to store the \( \mathbf{QK}^T \) matrices. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_98.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..6be32d671abe6b401db76b7827eb3a32da11d154 --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_98.txt @@ -0,0 +1 @@ +Long story short, the default self-attention algorithm quickly becomes prohibitively memory-expensive for large input contexts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_99.txt b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..227e12ea4f84227f972152f95bd918df92bb002e --- /dev/null +++ b/chunked/nltk_chunking/_llm_tutorial_optimization/chunk_99.txt @@ -0,0 +1 @@ +As LLMs improve in text comprehension and generation, they are applied to increasingly complex tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_0.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..1587c35432a15462c69dbb6388d4f35cec00091c --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_0.txt @@ -0,0 +1,5 @@ + +Model training anatomy +To understand performance optimization techniques that one can apply to improve efficiency of model training +speed and memory utilization, it's helpful to get familiar with how GPU is utilized during training, and how compute +intensity varies depending on an operation performed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_1.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9b25452934bcc46450a6b2daffe2dc3bccd1bce --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_1.txt @@ -0,0 +1 @@ +Let's start by exploring a motivating example of GPU utilization and the training run of a model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_10.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..776cdfb6f6c73be86611667596b348a77e5a3270 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_10.txt @@ -0,0 +1,2 @@ +However, not all free GPU memory can be used by +the user. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_11.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..67a4843451d69c227cfa853c72c83f6d490da7ab --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_11.txt @@ -0,0 +1 @@ +When a model is loaded to the GPU the kernels are also loaded, which can take up 1-2GB of memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_12.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..e2b6b1c73f6e6421cff50317734607c980942867 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_12.txt @@ -0,0 +1,2 @@ +To see how +much it is we load a tiny tensor into the GPU which triggers the kernels to be loaded as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_13.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b19c6e4271b5f57243901f589886656662c6561b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_13.txt @@ -0,0 +1,4 @@ +import torch +torch.ones((1, 1)).to("cuda") +print_gpu_utilization() +GPU memory occupied: 1343 MB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_14.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..721c55c0e1ce6373b399f8345b547cc504dfdf29 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_14.txt @@ -0,0 +1 @@ +We see that the kernels alone take up 1.3GB of GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_15.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c0735a2c48af2b37c793df6f780374f8064dcdd --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_15.txt @@ -0,0 +1 @@ +Now let's see how much space the model uses. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_16.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf2a04e14cd219182b3f70ea89465534d86346d7 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_16.txt @@ -0,0 +1,2 @@ +Load Model +First, we load the google-bert/bert-large-uncased model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_17.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac8f1a634af5a27253beb5e02eebe510297b3125 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_17.txt @@ -0,0 +1,2 @@ +We load the model weights directly to the GPU so that we can check +how much space just the weights use. 
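The print_gpu_utilization helper used in these snippets is defined earlier in the original guide and is not shown in this excerpt; a minimal version built on nvidia-ml-py3 (pynvml) might look like this:
python
from pynvml import nvmlInit, nvmlDeviceGetHandleByIndex, nvmlDeviceGetMemoryInfo

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)   # first visible GPU
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024**2} MB.")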
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_18.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..0537ba4f7b605023cb65c8d95dc033e0debd6b9d --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_18.txt @@ -0,0 +1,4 @@ +from transformers import AutoModelForSequenceClassification +model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-large-uncased").to("cuda") +print_gpu_utilization() +GPU memory occupied: 2631 MB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_19.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..64d9980fc92e50b41441a6b1ac4fb29b1fd80773 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_19.txt @@ -0,0 +1 @@ +We can see that the model weights alone take up 1.3 GB of GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_2.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b648fea79c1c89f6aba277e03ef338ef1ecd429 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_2.txt @@ -0,0 +1,5 @@ +For the demonstration, +we'll need to install a few libraries: + +pip install transformers datasets accelerate nvidia-ml-py3 +The nvidia-ml-py3 library allows us to monitor the memory usage of the models from within Python. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_20.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c6883b8f2cff54cddc085fb48e6eebe2991ac6b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_20.txt @@ -0,0 +1,2 @@ +The exact number depends on the specific +GPU you are using. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_21.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2671d4e6c1dbd9b0a71c574406bc0d953a46bf3 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_21.txt @@ -0,0 +1,2 @@ +Note that on newer GPUs a model can sometimes take up more space since the weights are loaded in an +optimized fashion that speeds up the usage of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_22.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..8aaa5fa33f75e0f19f9b86fe123a638c3e5c1400 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_22.txt @@ -0,0 +1,10 @@ +Now we can also quickly check if we get the same result +as with nvidia-smi CLI: + +nvidia-smi +```bash +Tue Jan 11 08:58:05 2022 ++-----------------------------------------------------------------------------+ +| NVIDIA-SMI 460.91.03 Driver Version: 460.91.03 CUDA Version: 11.2 | +|-------------------------------+----------------------+----------------------+ +| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_23.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d07a1402451ea27902c93ea97650b0e60fc0566e --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_23.txt @@ -0,0 +1,17 @@ +ECC | +| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | +| | | MIG M. | +|===============================+======================+======================| +| 0 Tesla V100-SXM2 On | 00000000:00:04.0 Off | 0 | +| N/A 37C P0 39W / 300W | 2631MiB / 16160MiB | 0% Default | +| | | N/A | ++-------------------------------+----------------------+----------------------+ ++-----------------------------------------------------------------------------+ +| Processes: | +| GPU GI CI PID Type Process name GPU Memory | +| ID ID Usage | +|=============================================================================| +| 0 N/A N/A 3721 C nvs/codeparrot/bin/python 2629MiB | ++-----------------------------------------------------------------------------+ + +We get the same number as before and you can also see that we are using a V100 GPU with 16GB of memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_24.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e78d218bbdb0112c68c88d2d79fe50d27621470 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_24.txt @@ -0,0 +1,2 @@ +So now we can +start training the model and see how the GPU memory consumption changes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_25.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d9a2bd4dd9fcf826b6e5756d4de13396b83aac --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_25.txt @@ -0,0 +1,13 @@ +First, we set up a few standard training +arguments: +py +default_args = { + "output_dir": "tmp", + "evaluation_strategy": "steps", + "num_train_epochs": 1, + "log_level": "error", + "report_to": "none", +} + +If you plan to run multiple experiments, in order to properly clear the memory between experiments, restart the Python + kernel between experiments. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_26.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..85ee8392a566722e9bf2cd6a94f0b69ebf811b13 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_26.txt @@ -0,0 +1,13 @@ +Memory utilization at vanilla training +Let's use the [Trainer] and train the model without using any GPU performance optimization techniques and a batch size of 4: + +from transformers import TrainingArguments, Trainer, logging +logging.set_verbosity_error() +training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +trainer = Trainer(model=model, args=training_args, train_dataset=ds) +result = trainer.train() +print_summary(result) + +Time: 57.82 +Samples/second: 8.86 +GPU memory occupied: 14949 MB. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_27.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c66f3db675e6dbec05506914a49b1761cb1c2af --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_27.txt @@ -0,0 +1 @@ +We see that already a relatively small batch size almost fills up our GPU's entire memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_28.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef985dd10db757b40bcb13f30a8a2b83a1be37db --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_28.txt @@ -0,0 +1,2 @@ +However, a larger batch size +can often result in faster model convergence or better end performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_29.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd9d52d118ba17ae225cb0d6d6e8145773fac58 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_29.txt @@ -0,0 +1,2 @@ +So ideally we want to tune the batch size to our +model's needs and not to the GPU limitations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_3.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2d499c08f531ea58c5a862aa171be902e88d26b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_3.txt @@ -0,0 +1,2 @@ +You might be familiar +with the nvidia-smi command in the terminal - this library allows to access the same information in Python directly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_30.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c6d1e3d36fb396f497ba73c503c5151602496a9 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_30.txt @@ -0,0 +1 @@ +What's interesting is that we use much more memory than the size of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_31.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9e6fa053d57ddee95f206f772e9f1edf65ea112 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_31.txt @@ -0,0 +1 @@ +To understand a bit better why this is the case let's have a look at a model's operations and memory needs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_32.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fd94ff3b60c962f86a3d0285825e4c491584b0b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_32.txt @@ -0,0 +1,2 @@ +Anatomy of Model's Operations +Transformers architecture includes 3 main groups of operations grouped below by compute-intensity. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_33.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..95f5d5ed767c3711ee845f8686f44c94bfa92a53 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_33.txt @@ -0,0 +1,2 @@ +Tensor Contractions +Linear layers and components of Multi-Head Attention all do batched matrix-matrix multiplications. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_34.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a3d43ede079a1061bfe249d88de35b92085333b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_34.txt @@ -0,0 +1 @@ +These operations are the most compute-intensive part of training a transformer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_35.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..65173c679a12a25470befdc9ebfa9b6d07b5c545 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_35.txt @@ -0,0 +1,2 @@ +Statistical Normalizations +Softmax and layer normalization are less compute-intensive than tensor contractions, and involve one or more reduction operations, the result of which is then applied via a map. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_36.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..1bdb06bdf55f244ed624551783897d2b81c9e47c --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_36.txt @@ -0,0 +1,2 @@ +Element-wise Operators +These are the remaining operators: biases, dropout, activations, and residual connections. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_37.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..278d699c48fe3bfa380ecca066198dfc7e8b5f78 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_37.txt @@ -0,0 +1 @@ +These are the least compute-intensive operations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_38.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c82b51499194063d1d88f8540dccfb7abd27426 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_38.txt @@ -0,0 +1 @@ +This knowledge can be helpful to know when analyzing performance bottlenecks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_39.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c16d95ffcbda84f46df7bc9ff0ddf55452b4d20 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_39.txt @@ -0,0 +1,3 @@ +This summary is derived from Data Movement Is All You Need: A Case Study on Optimizing Transformers 2020 +Anatomy of Model's Memory +We've seen that training the model uses much more memory than just putting the model on the GPU. 
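To make the three operation groups above concrete, here is a small, hypothetical PyTorch snippet (not taken from the chunked tutorial; tensor shapes are arbitrary) with one representative operation per group.

```py
import torch
import torch.nn.functional as F

batch, heads, seq, head_dim = 2, 12, 128, 64
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)

# 1) Tensor contraction: batched matrix-matrix multiplication, the most compute-intensive part
scores = torch.matmul(q, k.transpose(-2, -1)) / head_dim**0.5

# 2) Statistical normalization: a reduction (softmax) whose result is applied via a map
attn = F.softmax(scores, dim=-1)
context = torch.matmul(attn, v)

# 3) Element-wise operators: activations, dropout and residual additions, the least compute-intensive
out = context + F.dropout(F.gelu(context), p=0.1)
```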
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_4.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd39f9591306c97356d040792521cc8f9dda662e --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_4.txt @@ -0,0 +1 @@ +Then, we create some dummy data: random token IDs between 100 and 30000 and binary labels for a classifier. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_40.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..52bf1e30ff6880cd9366f8f61b63f8500966f599 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_40.txt @@ -0,0 +1,2 @@ +This is because there +are many components during training that use GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_41.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a3c9b181c347b1602e23a3b4da9deb8b0130081 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_41.txt @@ -0,0 +1,10 @@ +The components on GPU memory are the following: + +model weights +optimizer states +gradients +forward activations saved for gradient computation +temporary buffers +functionality-specific memory + +A typical model trained in mixed precision with AdamW requires 18 bytes per model parameter plus activation memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_42.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..af6f511beeb8e39c59d9e667b136710a53ed487f --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_42.txt @@ -0,0 +1,2 @@ +For +inference there are no optimizer states and gradients, so we can subtract those. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_43.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd972b59b2f14d2f84301cd9da08317288fafc86 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_43.txt @@ -0,0 +1,2 @@ +And thus we end up with 6 bytes per +model parameter for mixed precision inference, plus activation memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_44.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..c04a7cd9af2971e47cbb4d7b0aee3f1a69458ff3 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_44.txt @@ -0,0 +1 @@ +Let's look at the details. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_45.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..0898f5cb6bcf021e92411ce8392bbb4b21352200 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_45.txt @@ -0,0 +1,18 @@ +Model Weights: + +4 bytes * number of parameters for fp32 training +6 bytes * number of parameters for mixed precision training (maintains a model in fp32 and one in fp16 in memory) + +Optimizer States: + +8 bytes * number of parameters for normal AdamW (maintains 2 states) +2 bytes * number of parameters for 8-bit AdamW optimizers like bitsandbytes +4 bytes * number of parameters for optimizers like SGD with momentum (maintains only 1 state) + +Gradients + +4 bytes * number of parameters for either fp32 or mixed precision training (gradients are always kept in fp32) + +Forward Activations + +size depends on many factors, the key ones being sequence length, hidden size and batch size. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_46.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba29679ecc202b2861d32b2c8aff765820678e5d --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_46.txt @@ -0,0 +1,2 @@ +There are the input and output that are being passed and returned by the forward and the backward functions and the +forward activations saved for gradient computation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_47.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..32945e8f7e9934bfcf39ecf0978908568e45525c --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_47.txt @@ -0,0 +1,3 @@ +Temporary Memory +Additionally, there are all kinds of temporary variables which get released once the calculation is done, but in the +moment these could require additional memory and could push to OOM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_48.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ade92f48fc14f683ad60411c3c6b0b8ca817b532 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_48.txt @@ -0,0 +1,2 @@ +Therefore, when coding it's crucial to think +strategically about such temporary variables and sometimes to explicitly free those as soon as they are no longer needed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_49.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3251068f8c7b6e695905f914dac7f6582f73942 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_49.txt @@ -0,0 +1,2 @@ +Functionality-specific memory +Then, your software could have special memory needs. 
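As a worked example of the accounting above, the sketch below applies the per-parameter byte counts (6 for mixed-precision weights, 8 for AdamW states, 4 for gradients, i.e. 18 bytes per parameter for training and 6 for inference) to a hypothetical parameter count roughly the size of bert-large; activations, temporary buffers and functionality-specific memory are deliberately left out because they depend on batch size, sequence length and the software used.

```py
def training_memory_gib(num_params, weights=6, optimizer=8, grads=4):
    # mixed-precision AdamW training: 6 + 8 + 4 = 18 bytes per parameter, activations excluded
    return num_params * (weights + optimizer + grads) / 1024**3

def inference_memory_gib(num_params, bytes_per_param=6):
    # mixed-precision inference: no optimizer states and no gradients
    return num_params * bytes_per_param / 1024**3

num_params = 336_000_000  # roughly bert-large sized, used here only as an example
print(f"training : ~{training_memory_gib(num_params):.1f} GiB + activation memory")
print(f"inference: ~{inference_memory_gib(num_params):.1f} GiB + activation memory")
```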
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_5.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..c23ba5a1355d907d17b1bcbd7d6e62d45f35ea1f --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_5.txt @@ -0,0 +1 @@ +In total, we get 512 sequences each with length 512 and store them in a [~datasets.Dataset] with PyTorch format. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_50.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..60ab0c7dd9a3044e58951a1f94c778ca7068524b --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_50.txt @@ -0,0 +1,2 @@ +For example, when generating text using beam search, the software +needs to maintain multiple copies of inputs and outputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_51.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..41ded1e29b02bd5b051a4ac4478db01dcc73af77 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_51.txt @@ -0,0 +1,3 @@ +forward vs backward Execution Speed +For convolutions and linear layers there are 2x flops in the backward compared to the forward, which generally translates +into ~2x slower (sometimes more, because sizes in the backward tend to be more awkward). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_52.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f8d8ac6af6203ac6f86e75f739b51da2843b7c3 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_52.txt @@ -0,0 +1,3 @@ +Activations are usually +bandwidth-limited, and it’s typical for an activation to have to read more data in the backward than in the forward +(e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_53.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..54eda3afa5b8986deffb2d4e479a8196c91688c5 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_53.txt @@ -0,0 +1,2 @@ +activation forward reads once, writes once, activation backward reads twice, gradOutput and output of the forward, +and writes once, gradInput). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_54.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..c4a5f482ce29e4cb61c4ac96f6aebae9a0b9c8d4 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_54.txt @@ -0,0 +1 @@ +As you can see, there are potentially a few places where we could save GPU memory or speed up operations. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_55.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6c24b44e88d5c28c612527cadc39ca1c8abf2dd --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_55.txt @@ -0,0 +1,3 @@ +Now that you understand what affects GPU utilization and computation speed, refer to +the Methods and tools for efficient training on a single GPU documentation page to learn about +performance optimization techniques. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_6.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ced5882d0c920a9d48ab299075371722d9ba77e --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_6.txt @@ -0,0 +1,18 @@ +import numpy as np +from datasets import Dataset +seq_len, dataset_size = 512, 512 +dummy_data = { + "input_ids": np.random.randint(100, 30000, (dataset_size, seq_len)), + "labels": np.random.randint(0, 1, (dataset_size)), + } +ds = Dataset.from_dict(dummy_data) +ds.set_format("pt") + +To print summary statistics for the GPU utilization and the training run with the [Trainer] we define two helper functions: + +from pynvml import * +def print_gpu_utilization(): + nvmlInit() + handle = nvmlDeviceGetHandleByIndex(0) + info = nvmlDeviceGetMemoryInfo(handle) + print(f"GPU memory occupied: {info.used//1024**2} MB.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_7.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..50288142c5548c56c13b2211a334979667d0d943 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_7.txt @@ -0,0 +1,9 @@ +def print_summary(result): + print(f"Time: {result.metrics['train_runtime']:.2f}") + print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}") + print_gpu_utilization() + +Let's verify that we start with a free GPU memory: + +print_gpu_utilization() +GPU memory occupied: 0 MB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_8.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..59fb8a048c39ad59a88d80692f975b4e153dc573 --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_8.txt @@ -0,0 +1 @@ +That looks good: the GPU memory is not occupied as we would expect before we load any models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_memory_anatomy/chunk_9.txt b/chunked/nltk_chunking/_model_memory_anatomy/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8992ecd1580ba45e28c73db02e53eecadf76357f --- /dev/null +++ b/chunked/nltk_chunking/_model_memory_anatomy/chunk_9.txt @@ -0,0 +1,2 @@ +If that's not the case on +your machine make sure to stop all processes that are using GPU memory. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_0.txt b/chunked/nltk_chunking/_model_sharing/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..159cfcd18a187109a45c00dead31228292c59796 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_0.txt @@ -0,0 +1,3 @@ + +Share a model +The last two tutorials showed how you can fine-tune a model with PyTorch, Keras, and 🤗 Accelerate for distributed setups. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_1.txt b/chunked/nltk_chunking/_model_sharing/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..b12111855c6040fb80f8f81cd74ca52ca76ba30a --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_1.txt @@ -0,0 +1 @@ +The next step is to share your model with the community! \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_10.txt b/chunked/nltk_chunking/_model_sharing/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a00891bc71400e3cd599c917fcb42657ed40212 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_10.txt @@ -0,0 +1 @@ +The Model Hub's built-in versioning is based on git and git-lfs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_11.txt b/chunked/nltk_chunking/_model_sharing/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..221f338f5b28a3963870421d1211ed4c8bc6d95d --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_11.txt @@ -0,0 +1 @@ +In other words, you can treat one model as one repository, enabling greater access control and scalability. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_12.txt b/chunked/nltk_chunking/_model_sharing/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e0d084c4448cb9f19ee40aed4e4dd5860536695 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_12.txt @@ -0,0 +1 @@ +Version control allows revisions, a method for pinning a specific version of a model with a commit hash, tag or branch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_13.txt b/chunked/nltk_chunking/_model_sharing/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..80e68723c18af540c77197c8c4df65f7f0f3f0c6 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_13.txt @@ -0,0 +1,10 @@ +As a result, you can load a specific model version with the revision parameter: + +model = AutoModel.from_pretrained( + "julien-c/EsperBERTo-small", revision="v2.0.1" # tag name, or branch name, or commit hash + ) + +Files are also easily edited in a repository, and you can view the commit history as well as the difference: + +Setup +Before sharing a model to the Hub, you will need your Hugging Face credentials. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_14.txt b/chunked/nltk_chunking/_model_sharing/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..652a817860c92bc929f2c437b831867efc10f053 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_14.txt @@ -0,0 +1 @@ +If you have access to a terminal, run the following command in the virtual environment where 🤗 Transformers is installed. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_15.txt b/chunked/nltk_chunking/_model_sharing/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1dd7ac50a62d6e613ece3531e793396681aa05c --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_15.txt @@ -0,0 +1,4 @@ +This will store your access token in your Hugging Face cache folder (~/.cache/ by default): + +huggingface-cli login +If you are using a notebook like Jupyter or Colaboratory, make sure you have the huggingface_hub library installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_16.txt b/chunked/nltk_chunking/_model_sharing/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..91cd10a6a10ef8f9d43d2292a02badc5057ba93a --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_16.txt @@ -0,0 +1 @@ +This library allows you to programmatically interact with the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_17.txt b/chunked/nltk_chunking/_model_sharing/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd475f59a236d63397468884f8ee6b7ec3cf24c1 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_17.txt @@ -0,0 +1,8 @@ +pip install huggingface_hub +Then use notebook_login to sign-in to the Hub, and follow the link here to generate a token to login with: + +from huggingface_hub import notebook_login +notebook_login() + +Convert a model for all frameworks +To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_18.txt b/chunked/nltk_chunking/_model_sharing/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..63519003ba04e6b37e0b064e5effa231917e3d78 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_18.txt @@ -0,0 +1 @@ +While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_19.txt b/chunked/nltk_chunking/_model_sharing/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a08dcbf1731117afd856bbb0ec2666cb6eeb7500 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_19.txt @@ -0,0 +1 @@ +Converting a checkpoint for another framework is easy. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_2.txt b/chunked/nltk_chunking/_model_sharing/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..c497e09384a941ed9d5f92524ed5d564160512c3 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_2.txt @@ -0,0 +1 @@ +At Hugging Face, we believe in openly sharing knowledge and resources to democratize artificial intelligence for everyone. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_20.txt b/chunked/nltk_chunking/_model_sharing/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a65d8a1cb4243a0820f63fdac9e7ff8f506e08 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_20.txt @@ -0,0 +1 @@ +Make sure you have PyTorch and TensorFlow installed (see here for installation instructions), and then find the specific model for your task in the other framework. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_21.txt b/chunked/nltk_chunking/_model_sharing/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..8de194264fc7cda0942060e975a0a34b90c57eee --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_21.txt @@ -0,0 +1,24 @@ +Specify from_tf=True to convert a checkpoint from TensorFlow to PyTorch: + +pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True) +pt_model.save_pretrained("path/to/awesome-name-you-picked") +`` + + +Specifyfrom_pt=True` to convert a checkpoint from PyTorch to TensorFlow: + +tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True) + +Then you can save your new TensorFlow model with its new checkpoint: + +tf_model.save_pretrained("path/to/awesome-name-you-picked") + +If a model is available in Flax, you can also convert a checkpoint from PyTorch to Flax: + +flax_model = FlaxDistilBertForSequenceClassification.from_pretrained( + "path/to/awesome-name-you-picked", from_pt=True + ) + +Push a model during training + +Sharing a model to the Hub is as simple as adding an extra parameter or callback. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_22.txt b/chunked/nltk_chunking/_model_sharing/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..574287bc2e4444b554e8cd1ee39ff7ed35a4f0e8 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_22.txt @@ -0,0 +1 @@ +Remember from the fine-tuning tutorial, the [TrainingArguments] class is where you specify hyperparameters and additional training options. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_23.txt b/chunked/nltk_chunking/_model_sharing/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa433370587d7bbff1aa7001a30cc8d874f15bf3 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_23.txt @@ -0,0 +1 @@ +One of these training options includes the ability to push a model directly to the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_24.txt b/chunked/nltk_chunking/_model_sharing/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..67cd43d6f6be6eb3ac1c8cfaa3acd917d7f611d5 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_24.txt @@ -0,0 +1,15 @@ +Set push_to_hub=True in your [TrainingArguments]: + +training_args = TrainingArguments(output_dir="my-awesome-model", push_to_hub=True) + +Pass your training arguments as usual to [Trainer]: + +trainer = Trainer( + model=model, + args=training_args, + train_dataset=small_train_dataset, + eval_dataset=small_eval_dataset, + compute_metrics=compute_metrics, + ) + +After you fine-tune your model, call [~transformers.Trainer.push_to_hub] on [Trainer] to push the trained model to the Hub. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_25.txt b/chunked/nltk_chunking/_model_sharing/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..08587548cfb91a8bd582cecdf01fa693c824cd08 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_25.txt @@ -0,0 +1 @@ +🤗 Transformers will even automatically add training hyperparameters, training results and framework versions to your model card! \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_26.txt b/chunked/nltk_chunking/_model_sharing/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ec7cb2e5f9f1420042f3ca5258d7538cef4b03c --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_26.txt @@ -0,0 +1,5 @@ +trainer.push_to_hub() +`` + + +Share a model to the Hub with [PushToHubCallback]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_27.txt b/chunked/nltk_chunking/_model_sharing/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..09047c0ad80ee628a802ec2277b1f20fae4e1eda --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_27.txt @@ -0,0 +1,3 @@ +In the [PushToHubCallback`] function, add: + +An output directory for your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_28.txt b/chunked/nltk_chunking/_model_sharing/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c1c06219a4ae8becf8ce2d40ab03a3ad41ce106 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_28.txt @@ -0,0 +1 @@ +A tokenizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_29.txt b/chunked/nltk_chunking/_model_sharing/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e82facff58572df38d0cec1acb635e3e6a77f0b --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_29.txt @@ -0,0 +1 @@ +The hub_model_id, which is your Hub username and model name. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_3.txt b/chunked/nltk_chunking/_model_sharing/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..db82e33a83ab7ba6171b70735d40d27f7e7c832b --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_3.txt @@ -0,0 +1 @@ +We encourage you to consider sharing your model with the community to help others save time and resources. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_30.txt b/chunked/nltk_chunking/_model_sharing/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e49bd33dba0b9db52cf3ce0eed1b8594a727ccf --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_30.txt @@ -0,0 +1,11 @@ +from transformers import PushToHubCallback +push_to_hub_callback = PushToHubCallback( + output_dir="./your_model_save_path", tokenizer=tokenizer, hub_model_id="your-username/my-awesome-model" + ) + +Add the callback to fit, and 🤗 Transformers will push the trained model to the Hub: + +model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3, callbacks=push_to_hub_callback) + +Use the push_to_hub function +You can also call push_to_hub directly on your model to upload it to the Hub. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_31.txt b/chunked/nltk_chunking/_model_sharing/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a509e3afc22487844121d4e52643ae23223c9fa0 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_31.txt @@ -0,0 +1,5 @@ +Specify your model name in push_to_hub: + +pt_model.push_to_hub("my-awesome-model") + +This creates a repository under your username with the model name my-awesome-model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_32.txt b/chunked/nltk_chunking/_model_sharing/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6d27f6dbe8f4197bdffca5c12bc00f4d5cbe45d --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_32.txt @@ -0,0 +1,10 @@ +Users can now load your model with the from_pretrained function: + +from transformers import AutoModel +model = AutoModel.from_pretrained("your_username/my-awesome-model") + +If you belong to an organization and want to push your model under the organization name instead, just add it to the repo_id: + +pt_model.push_to_hub("my-awesome-org/my-awesome-model") + +The push_to_hub function can also be used to add other files to a model repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_33.txt b/chunked/nltk_chunking/_model_sharing/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..465c30696539c225e95dbf3052e0341426064cef --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_33.txt @@ -0,0 +1,9 @@ +For example, add a tokenizer to a model repository: + +tokenizer.push_to_hub("my-awesome-model") + +Or perhaps you'd like to add the TensorFlow version of your fine-tuned PyTorch model: + +tf_model.push_to_hub("my-awesome-model") + +Now when you navigate to your Hugging Face profile, you should see your newly created model repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_34.txt b/chunked/nltk_chunking/_model_sharing/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..b74c834800e36ef3440cfaf1466a97cf1e296db6 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_34.txt @@ -0,0 +1 @@ +Clicking on the Files tab will display all the files you've uploaded to the repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_35.txt b/chunked/nltk_chunking/_model_sharing/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef32986c92e3863f83eea9cbc68cfacef113e497 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_35.txt @@ -0,0 +1 @@ +For more details on how to create and upload files to a repository, refer to the Hub documentation here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_36.txt b/chunked/nltk_chunking/_model_sharing/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..305fdecf1bbe4ea00bdb516c9fac5a4631b1ec73 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_36.txt @@ -0,0 +1,2 @@ +Upload with the web interface +Users who prefer a no-code approach are able to upload a model through the Hub's web interface. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_37.txt b/chunked/nltk_chunking/_model_sharing/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff6c37af41ed150d92d9542ea5554b1d8a66d918 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_37.txt @@ -0,0 +1,5 @@ +Visit huggingface.co/new to create a new repository: + +From here, add some information about your model: + +Select the owner of the repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_38.txt b/chunked/nltk_chunking/_model_sharing/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec3a5916735cdb3f94facab924201b5cc3c9f29d --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_38.txt @@ -0,0 +1 @@ +This can be yourself or any of the organizations you belong to. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_39.txt b/chunked/nltk_chunking/_model_sharing/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..858ef94b3e83fa88e39495c8c9eed1d665b3e8f2 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_39.txt @@ -0,0 +1 @@ +Pick a name for your model, which will also be the repository name. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_4.txt b/chunked/nltk_chunking/_model_sharing/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..916a2539f823e8f2a9e87a323c18123af34fa30e --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_4.txt @@ -0,0 +1,3 @@ +In this tutorial, you will learn two methods for sharing a trained or fine-tuned model on the Model Hub: + +Programmatically push your files to the Hub. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_40.txt b/chunked/nltk_chunking/_model_sharing/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..265a15d8d1b143e111df06cc88bfaec7025c7277 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_40.txt @@ -0,0 +1 @@ +Choose whether your model is public or private. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_41.txt b/chunked/nltk_chunking/_model_sharing/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef817da96208985d573ad389821568eb151f37a8 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_41.txt @@ -0,0 +1 @@ +Specify the license usage for your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_42.txt b/chunked/nltk_chunking/_model_sharing/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9fa529bdc042d5faf1b4eb621160ece05bc9978 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_42.txt @@ -0,0 +1 @@ +Now click on the Files tab and click on the Add file button to upload a new file to your repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_43.txt b/chunked/nltk_chunking/_model_sharing/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..566e13a494ba1e016f6e4448adc4b4655df6c29f --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_43.txt @@ -0,0 +1 @@ +Then drag-and-drop a file to upload and add a commit message. 
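For completeness, the web flow described above also has a programmatic counterpart in the huggingface_hub library; the sketch below is only an illustration, the repository name, file path and commit message are placeholders, and it assumes you are already logged in as described in the setup chunks.

```py
from huggingface_hub import HfApi

api = HfApi()

# Equivalent of creating a repository at huggingface.co/new: pick an owner/name and visibility
api.create_repo(repo_id="your-username/my-awesome-model", private=False, exist_ok=True)

# Equivalent of the "Add file" button: upload a file with a commit message
api.upload_file(
    path_or_fileobj="path/to/pytorch_model.bin",
    path_in_repo="pytorch_model.bin",
    repo_id="your-username/my-awesome-model",
    commit_message="Upload model weights",
)
```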
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_44.txt b/chunked/nltk_chunking/_model_sharing/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..89a536ec3567b43034fef9897d0e023d519809ef --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_44.txt @@ -0,0 +1,2 @@ +Add a model card +To make sure users understand your model's capabilities, limitations, potential biases and ethical considerations, please add a model card to your repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_45.txt b/chunked/nltk_chunking/_model_sharing/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fb1deb55806e2f4ddce04ebf7b466eead998d04 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_45.txt @@ -0,0 +1 @@ +The model card is defined in the README.md file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_46.txt b/chunked/nltk_chunking/_model_sharing/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..c64f26811a7cbfe6af6df6c57a00b7d721ff9af7 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_46.txt @@ -0,0 +1,3 @@ +You can add a model card by: + +Manually creating and uploading a README.md file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_47.txt b/chunked/nltk_chunking/_model_sharing/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..d33d60388b38bd0bc6e53351c1fbbef5a247950c --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_47.txt @@ -0,0 +1 @@ +Clicking on the Edit model card button in your model repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_48.txt b/chunked/nltk_chunking/_model_sharing/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..73d606a14f20f7346026655a0de3a294e823de5e --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_48.txt @@ -0,0 +1 @@ +Take a look at the DistilBert model card for a good example of the type of information a model card should include. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_49.txt b/chunked/nltk_chunking/_model_sharing/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..a527cf8c814642e92ecff1b6c49908300fbc9d8c --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_49.txt @@ -0,0 +1 @@ +For more details about other options you can control in the README.md file such as a model's carbon footprint or widget examples, refer to the documentation here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_5.txt b/chunked/nltk_chunking/_model_sharing/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..f7d0e41ca29533ceec6a91e2c653337b4905ea7f --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_5.txt @@ -0,0 +1 @@ +Drag-and-drop your files to the Hub with the web interface. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_6.txt b/chunked/nltk_chunking/_model_sharing/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..623ef98e29d7001f3be19afaf309b439d0cb4f68 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_6.txt @@ -0,0 +1 @@ +To share a model with the community, you need an account on huggingface.co. 
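Besides the two approaches listed in the model card chunks above (uploading a README.md manually or using the Edit model card button), the huggingface_hub library offers ModelCard helpers for writing the card programmatically. The sketch below is an assumption-laden illustration: the metadata values, free-text sections and repository name are all placeholders.

```py
from huggingface_hub import ModelCard, ModelCardData

# YAML metadata block of the card (license, language, tags, ...); values are placeholders
card_data = ModelCardData(language="en", license="apache-2.0", tags=["text-classification"])

content = f"""---
{card_data.to_yaml()}
---

# my-awesome-model

Describe the model's capabilities, limitations, potential biases and ethical considerations here.
"""

# Requires being logged in; this writes README.md to the repository
ModelCard(content).push_to_hub("your-username/my-awesome-model")
```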
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_7.txt b/chunked/nltk_chunking/_model_sharing/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..992c337084d4856a18545e7de83f2d875457e434 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_7.txt @@ -0,0 +1 @@ +You can also join an existing organization or create a new one. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_8.txt b/chunked/nltk_chunking/_model_sharing/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6310007fcb18c320bb5a8bf9d9d1309ead455bd0 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_8.txt @@ -0,0 +1,2 @@ +Repository features +Each repository on the Model Hub behaves like a typical GitHub repository. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_sharing/chunk_9.txt b/chunked/nltk_chunking/_model_sharing/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..4e6c64317d2066387da086d823b7619a356ff410 --- /dev/null +++ b/chunked/nltk_chunking/_model_sharing/chunk_9.txt @@ -0,0 +1 @@ +Our repositories offer versioning, commit history, and the ability to visualize differences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_0.txt b/chunked/nltk_chunking/_model_summary/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4ee00df9614d87241b47bcf2ec1dd0217e60ab5 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_0.txt @@ -0,0 +1,3 @@ + +The Transformer model family +Since its introduction in 2017, the original Transformer model has inspired many new and exciting models that extend beyond natural language processing (NLP) tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_1.txt b/chunked/nltk_chunking/_model_summary/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba2033c4de5a7732c3ad9d0eab87b668a5d8d239 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_1.txt @@ -0,0 +1 @@ +There are models for predicting the folded structure of proteins, training a cheetah to run, and time series forecasting. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_10.txt b/chunked/nltk_chunking/_model_summary/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6725797a17890912456a62298ef95c07bf9a31a9 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_10.txt @@ -0,0 +1 @@ +For example, ConvNeXt uses non-overlapping sliding windows to patchify an image and a larger kernel to increase its global receptive field. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_100.txt b/chunked/nltk_chunking/_model_summary/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..cced6651a1ecc9da5603bc145dfe1e46d5ab6d5a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_100.txt @@ -0,0 +1 @@ +Donut is pretrained to read text by predicting the next word based on the image and text annotations. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_101.txt b/chunked/nltk_chunking/_model_summary/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4a07389550a2b7c19e5dd81c62a23ff1d016626 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_101.txt @@ -0,0 +1 @@ +The decoder generates a token sequence given a prompt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_102.txt b/chunked/nltk_chunking/_model_summary/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e9dfbeb582fceb458cf221556eb086c89875254 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_102.txt @@ -0,0 +1 @@ +The prompt is represented by a special token for each downstream task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_103.txt b/chunked/nltk_chunking/_model_summary/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..b66cd26345532bc622906b276a08bd1f37be4a01 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_103.txt @@ -0,0 +1 @@ +For example, document parsing has a special parsing token that is combined with the encoder hidden states to parse the document into a structured output format (JSON). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_104.txt b/chunked/nltk_chunking/_model_summary/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..f44688c7da6145d7a1ce25efbba20a973d5cc700 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_104.txt @@ -0,0 +1,4 @@ +Reinforcement learning + +Decoder[[rl-decoder]] +The Decision and Trajectory Transformer casts the state, action, and reward as a sequence modeling problem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_105.txt b/chunked/nltk_chunking/_model_summary/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..cefcb916b6ce3f04fafc7b254346caa30b2d16dc --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_105.txt @@ -0,0 +1 @@ +The Decision Transformer generates a series of actions that lead to a future desired return based on returns-to-go, past states, and actions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_106.txt b/chunked/nltk_chunking/_model_summary/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..420a23141bcb5f1a9b88e4451f028baa43882cb4 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_106.txt @@ -0,0 +1 @@ +For the last K timesteps, each of the three modalities are converted into token embeddings and processed by a GPT-like model to predict a future action token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_107.txt b/chunked/nltk_chunking/_model_summary/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..c594fce5c82553fcca85b46a4fc16c0bf34341bf --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_107.txt @@ -0,0 +1 @@ +Trajectory Transformer also tokenizes the states, actions, and rewards and processes them with a GPT architecture. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_108.txt b/chunked/nltk_chunking/_model_summary/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfda2b0001a4fc782086f1118f924907438bc377 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_108.txt @@ -0,0 +1 @@ +Unlike the Decision Transformer, which is focused on reward conditioning, the Trajectory Transformer generates future actions with beam search. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_11.txt b/chunked/nltk_chunking/_model_summary/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2bede31c13ff6b0a1c24570ab8b52e2f0e76c12 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_11.txt @@ -0,0 +1 @@ +ConvNeXt also makes several layer design choices to be more memory-efficient and improve performance, so it competes favorably with Transformers! \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_12.txt b/chunked/nltk_chunking/_model_summary/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..0f9bb5c281bb839e315c9eab72d636fcaaa5a4b2 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_12.txt @@ -0,0 +1,2 @@ +Encoder[[cv-encoder]] +The Vision Transformer (ViT) opened the door to computer vision tasks without convolutions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_13.txt b/chunked/nltk_chunking/_model_summary/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c99f433be555b35fbe02829bf5eb04cc5977407 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_13.txt @@ -0,0 +1 @@ +ViT uses a standard Transformer encoder, but its main breakthrough was how it treated an image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_14.txt b/chunked/nltk_chunking/_model_summary/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..900f3698b3be5e79cff0e398d0094c3a1e42a29d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_14.txt @@ -0,0 +1 @@ +It splits an image into fixed-size patches and uses them to create an embedding, just like how a sentence is split into tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_15.txt b/chunked/nltk_chunking/_model_summary/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..407860da1680db1cea5d6c24a53b344bdc4d3386 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_15.txt @@ -0,0 +1 @@ +ViT capitalized on the Transformers' efficient architecture to demonstrate competitive results with the CNNs at the time while requiring fewer resources to train. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_16.txt b/chunked/nltk_chunking/_model_summary/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..dab38145ff8989b2b6b32781c59833e7fd2c29e8 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_16.txt @@ -0,0 +1 @@ +ViT was soon followed by other vision models that could also handle dense vision tasks like segmentation as well as detection. 
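The patchify-and-embed idea described in the ViT chunks above can be illustrated with a few lines of PyTorch; this is a minimal sketch with ViT-Base-like sizes chosen for illustration, not the actual implementation in 🤗 Transformers.

```py
import torch
from torch import nn

image = torch.randn(1, 3, 224, 224)          # (batch, channels, height, width)
patch_size, hidden_size = 16, 768            # illustrative ViT-Base-like values

# A strided convolution cuts the image into non-overlapping 16x16 patches and projects
# each patch to an embedding, analogous to splitting a sentence into token embeddings.
patch_embed = nn.Conv2d(3, hidden_size, kernel_size=patch_size, stride=patch_size)

patches = patch_embed(image)                 # (1, 768, 14, 14)
tokens = patches.flatten(2).transpose(1, 2)  # (1, 196, 768): a "sentence" of 196 patch tokens
```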
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_17.txt b/chunked/nltk_chunking/_model_summary/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..197971e908a24e66e2e15e2e60ee2509a168e28f --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_17.txt @@ -0,0 +1 @@ +One of these models is the Swin Transformer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_18.txt b/chunked/nltk_chunking/_model_summary/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..b46091ed731929477ad84e19bdd5e9db9df93064 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_18.txt @@ -0,0 +1 @@ +It builds hierarchical feature maps (like a CNN 👀 and unlike ViT) from smaller-sized patches and merges them with neighboring patches in deeper layers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_19.txt b/chunked/nltk_chunking/_model_summary/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..e90622f736647b5226978172f368d2ba89ea4f28 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_19.txt @@ -0,0 +1 @@ +Attention is only computed within a local window, and the window is shifted between attention layers to create connections to help the model learn better. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_2.txt b/chunked/nltk_chunking/_model_summary/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..a1ca5cadfd1a028ee39671b2847d914b11ddd9b9 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_2.txt @@ -0,0 +1 @@ +With so many Transformer variants available, it can be easy to miss the bigger picture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_20.txt b/chunked/nltk_chunking/_model_summary/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..c34ca7f972ecfe02163ae002658e702308910741 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_20.txt @@ -0,0 +1 @@ +Since the Swin Transformer can produce hierarchical feature maps, it is a good candidate for dense prediction tasks like segmentation and detection. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_21.txt b/chunked/nltk_chunking/_model_summary/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..57ddff40c7e00491a6c55221c05801e00b3c6ac6 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_21.txt @@ -0,0 +1 @@ +The SegFormer also uses a Transformer encoder to build hierarchical feature maps, but it adds a simple multilayer perceptron (MLP) decoder on top to combine all the feature maps and make a prediction. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_22.txt b/chunked/nltk_chunking/_model_summary/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..819d99391b07a3fa7e27602e08a4f8a3e11e7a83 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_22.txt @@ -0,0 +1 @@ +Other vision models, like BeIT and ViTMAE, drew inspiration from BERT's pretraining objective. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_23.txt b/chunked/nltk_chunking/_model_summary/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e93d76eb1ee8129a35955c9b2338f12638ec009 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_23.txt @@ -0,0 +1 @@ +BeIT is pretrained by masked image modeling (MIM); the image patches are randomly masked, and the image is also tokenized into visual tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_24.txt b/chunked/nltk_chunking/_model_summary/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ec9fad313a41e4a62c70e9bf7da219c2f6001ef --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_24.txt @@ -0,0 +1 @@ +BeIT is trained to predict the visual tokens corresponding to the masked patches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_25.txt b/chunked/nltk_chunking/_model_summary/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..910cd0a2fecdd3f1f9f15e08bbaa27c06bb35b0e --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_25.txt @@ -0,0 +1 @@ +ViTMAE has a similar pretraining objective, except it must predict the pixels instead of visual tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_26.txt b/chunked/nltk_chunking/_model_summary/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dbce94fc08cf00b757617a54f3252222bc86e38 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_26.txt @@ -0,0 +1 @@ +What's unusual is 75% of the image patches are masked! \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_27.txt b/chunked/nltk_chunking/_model_summary/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..1afdce934d8ab3be3f1cb0d0e84ba6041baaa0d5 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_27.txt @@ -0,0 +1 @@ +The decoder reconstructs the pixels from the masked tokens and encoded patches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_28.txt b/chunked/nltk_chunking/_model_summary/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..f759abf54abd5702c25a43fb48f556a1b9d5de37 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_28.txt @@ -0,0 +1 @@ +After pretraining, the decoder is thrown away, and the encoder is ready to be used in downstream tasks. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_29.txt b/chunked/nltk_chunking/_model_summary/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..19b603765a8c542e2229439a01682e641cc7ef53 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_29.txt @@ -0,0 +1,2 @@ +Decoder[[cv-decoder]] +Decoder-only vision models are rare because most vision models rely on an encoder to learn an image representation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_3.txt b/chunked/nltk_chunking/_model_summary/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d9a36162494c1e03a4dc2058832be027bb3033a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_3.txt @@ -0,0 +1 @@ +What all these models have in common is they're based on the original Transformer architecture. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_30.txt b/chunked/nltk_chunking/_model_summary/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..838c35499068bfc648ca7d4fc91e8265da5eadf5 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_30.txt @@ -0,0 +1 @@ +But for use cases like image generation, the decoder is a natural fit, as we've seen from text generation models like GPT-2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_31.txt b/chunked/nltk_chunking/_model_summary/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..86023e180087b017b55af35455fedcaeb8782bc4 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_31.txt @@ -0,0 +1 @@ +ImageGPT uses the same architecture as GPT-2, but instead of predicting the next token in a sequence, it predicts the next pixel in an image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_32.txt b/chunked/nltk_chunking/_model_summary/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..86bcd0b3bb3c45f044c4c00774b50f2c4f60b740 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_32.txt @@ -0,0 +1 @@ +In addition to image generation, ImageGPT could also be finetuned for image classification. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_33.txt b/chunked/nltk_chunking/_model_summary/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..74899c7a21dfb540c49d82e6d06676e21e87b5f7 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_33.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[cv-encoder-decoder]] +Vision models commonly use an encoder (also known as a backbone) to extract important image features before passing them to a Transformer decoder. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_34.txt b/chunked/nltk_chunking/_model_summary/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..312c982b10058f7b79a6e0a94900334f5a4bedfd --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_34.txt @@ -0,0 +1 @@ +DETR has a pretrained backbone, but it also uses the complete Transformer encoder-decoder architecture for object detection. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_35.txt b/chunked/nltk_chunking/_model_summary/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a726408f35fe5368f3858d65f767db9e82d520b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_35.txt @@ -0,0 +1 @@ +The encoder learns image representations and combines them with object queries (each object query is a learned embedding that focuses on a region or object in an image) in the decoder. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_36.txt b/chunked/nltk_chunking/_model_summary/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..551f924946f76454a5a975d8302cb17af66d386d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_36.txt @@ -0,0 +1 @@ +DETR predicts the bounding box coordinates and class label for each object query. 
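A minimal sketch of running object detection with a DETR checkpoint through the pipeline API (the checkpoint name and image path are placeholder examples):
from transformers import pipeline

# facebook/detr-resnet-50 is a public reference DETR checkpoint
detector = pipeline("object-detection", model="facebook/detr-resnet-50")
for obj in detector("path/to/street_scene.jpg"):
    print(obj["label"], obj["score"], obj["box"])  # predicted class, confidence, and bounding box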
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_37.txt b/chunked/nltk_chunking/_model_summary/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1a087dbbe92ffb808c37080556ec441e0c3282a0 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_37.txt @@ -0,0 +1,4 @@ +Natural language processing + +Encoder[[nlp-encoder]] +BERT is an encoder-only Transformer that randomly masks certain tokens in the input to avoid seeing other tokens, which would allow it to "cheat". \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_38.txt b/chunked/nltk_chunking/_model_summary/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..042853f947770dc47c5f02ff554e13be7157dd4b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_38.txt @@ -0,0 +1 @@ +The pretraining objective is to predict the masked token based on the context. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_39.txt b/chunked/nltk_chunking/_model_summary/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..26a0f0955fe8b1f87a1659d6ef087d06129c6941 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_39.txt @@ -0,0 +1 @@ +This allows BERT to fully use the left and right contexts to help it learn a deeper and richer representation of the inputs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_4.txt b/chunked/nltk_chunking/_model_summary/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e5fb8901cc3b03f2b9cde890ddba25a7f8532e2 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_4.txt @@ -0,0 +1 @@ +Some models only use the encoder or decoder, while others use both. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_40.txt b/chunked/nltk_chunking/_model_summary/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..048ece9e9c736a98c0823d7c4c726acedea94ad2 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_40.txt @@ -0,0 +1 @@ +However, there was still room for improvement in BERT's pretraining strategy. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_41.txt b/chunked/nltk_chunking/_model_summary/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea15c4a27c63b879c029fefe0f85fd7fdba28d77 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_41.txt @@ -0,0 +1 @@ +RoBERTa improved upon this by introducing a new pretraining recipe that includes training for longer and on larger batches, randomly masking tokens at each epoch instead of just once during preprocessing, and removing the next-sentence prediction objective. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_42.txt b/chunked/nltk_chunking/_model_summary/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..e9d1dc15faf2a4f5da81ff584bb6054a0c0e5c81 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_42.txt @@ -0,0 +1 @@ +The dominant strategy to improve performance is to increase the model size. 
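A minimal sketch of exercising the masked language modeling objective described above with the fill-mask pipeline (the checkpoint and sentence are placeholder examples; the mask token varies by checkpoint):
from transformers import pipeline

unmasker = pipeline("fill-mask", model="google-bert/bert-base-uncased")
# the model predicts the most likely tokens for the [MASK] position using both left and right context
print(unmasker("The capital of France is [MASK].")[0])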
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_43.txt b/chunked/nltk_chunking/_model_summary/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec218fef25a189f84e53c2f00e531d8e5b1719ad --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_43.txt @@ -0,0 +1 @@ +But training large models is computationally expensive. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_44.txt b/chunked/nltk_chunking/_model_summary/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5aa0876384e28f6d980df29dffa4f0ee8a01eed --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_44.txt @@ -0,0 +1 @@ +One way to reduce computational costs is using a smaller model like DistilBERT. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_45.txt b/chunked/nltk_chunking/_model_summary/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..12f2515bde2570301fc8c1e8e43ca4ab535eb089 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_45.txt @@ -0,0 +1 @@ +DistilBERT uses knowledge distillation - a compression technique - to create a smaller version of BERT while keeping nearly all of its language understanding capabilities. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_46.txt b/chunked/nltk_chunking/_model_summary/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b23afee9752e61bf1bfdf8ad742aafab6f07c1ee --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_46.txt @@ -0,0 +1 @@ +However, most Transformer models continued to trend towards more parameters, leading to new models focused on improving training efficiency. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_47.txt b/chunked/nltk_chunking/_model_summary/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fdad700b0cd037f920c4f8826878b97d72c82b4 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_47.txt @@ -0,0 +1 @@ +ALBERT reduces memory consumption by lowering the number of parameters in two ways: separating the larger vocabulary embedding into two smaller matrices and allowing layers to share parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_48.txt b/chunked/nltk_chunking/_model_summary/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef16c4f38ac6987db4800bf209cc1250a2a9454d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_48.txt @@ -0,0 +1 @@ +DeBERTa added a disentangled attention mechanism where the word and its position are separately encoded in two vectors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_49.txt b/chunked/nltk_chunking/_model_summary/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6b755b5e8beec14577bade47181e638a50db881 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_49.txt @@ -0,0 +1 @@ +The attention is computed from these separate vectors instead of a single vector containing the word and position embeddings. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_5.txt b/chunked/nltk_chunking/_model_summary/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..96c13c8d5ecf7c4da70097ce2dba722bc9743b1d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_5.txt @@ -0,0 +1 @@ +This provides a useful taxonomy to categorize and examine the high-level differences within models in the Transformer family, and it'll help you understand Transformers you haven't encountered before. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_50.txt b/chunked/nltk_chunking/_model_summary/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc3abf6520898213d15cf36fc5dbeb1e56dcfb01 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_50.txt @@ -0,0 +1 @@ +Longformer also focused on making attention more efficient, especially for processing documents with longer sequence lengths. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_51.txt b/chunked/nltk_chunking/_model_summary/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba6d4a69d8c7af6234153b22eba5333b2f2c4c7a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_51.txt @@ -0,0 +1 @@ +It uses a combination of local windowed attention (attention only calculated from fixed window size around each token) and global attention (only for specific task tokens like [CLS] for classification) to create a sparse attention matrix instead of a full attention matrix. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_52.txt b/chunked/nltk_chunking/_model_summary/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..e81af808a01aaa70320bea52f5e0e01e2a02d1ca --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_52.txt @@ -0,0 +1,2 @@ +Decoder[[nlp-decoder]] +GPT-2 is a decoder-only Transformer that predicts the next word in the sequence. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_53.txt b/chunked/nltk_chunking/_model_summary/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6aebba67b3d8fe9f209190704466221e477deca --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_53.txt @@ -0,0 +1 @@ +It masks tokens to the right so the model can't "cheat" by looking ahead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_54.txt b/chunked/nltk_chunking/_model_summary/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0b3214e1ad00b9c3de0f772452d23ed838f7f9d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_54.txt @@ -0,0 +1 @@ +By pretraining on a massive body of text, GPT-2 became really good at generating text, even if the text is only sometimes accurate or true. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_55.txt b/chunked/nltk_chunking/_model_summary/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c83a758a7e1299f6d6611e49d1af9444df60c3fa --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_55.txt @@ -0,0 +1 @@ +But GPT-2 lacked the bidirectional context from BERT's pretraining, which made it unsuitable for certain tasks. 
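A minimal sketch of autoregressive generation with a decoder-only checkpoint such as GPT-2 (the prompt and generation settings are placeholder examples):
from transformers import pipeline

generator = pipeline("text-generation", model="openai-community/gpt2")
# the model repeatedly predicts the next token, conditioning only on the tokens to its left
print(generator("Transformers are", max_new_tokens=20)[0]["generated_text"])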
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_56.txt b/chunked/nltk_chunking/_model_summary/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..427c83f7d2358f4e2bb4503b891e77821b08470b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_56.txt @@ -0,0 +1 @@ +XLNET combines the best of both BERT and GPT-2's pretraining objectives by using a permutation language modeling objective (PLM) that allows it to learn bidirectionally. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_57.txt b/chunked/nltk_chunking/_model_summary/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e7b41fb03ce21ddd96ab1e93f3a6675786bb119 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_57.txt @@ -0,0 +1 @@ +After GPT-2, language models grew even bigger and are now known as large language models (LLMs). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_58.txt b/chunked/nltk_chunking/_model_summary/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..5abe4601f5ac5f793d268fbc0d1443523a76be33 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_58.txt @@ -0,0 +1 @@ +LLMs demonstrate few- or even zero-shot learning if pretrained on a large enough dataset. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_59.txt b/chunked/nltk_chunking/_model_summary/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..79a97edcdf3a5a3d6d15412fa0fda392b48adcf5 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_59.txt @@ -0,0 +1 @@ +GPT-J is an LLM with 6B parameters and trained on 400B tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_6.txt b/chunked/nltk_chunking/_model_summary/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..0be8a6d9190ee7f3d0f5113bd8e7081c0ef69a96 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_6.txt @@ -0,0 +1 @@ +If you aren't familiar with the original Transformer model or need a refresher, check out the How do Transformers work chapter from the Hugging Face course. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_60.txt b/chunked/nltk_chunking/_model_summary/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..93c5035d5ce79a3b7a5453d32012306b67d1f27b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_60.txt @@ -0,0 +1 @@ +GPT-J was followed by OPT, a family of decoder-only models, the largest of which is 175B and trained on 180B tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_61.txt b/chunked/nltk_chunking/_model_summary/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..fdfe060c0be6d01fe2eb9ec9423cdaba39883d17 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_61.txt @@ -0,0 +1 @@ +BLOOM was released around the same time, and the largest model in the family has 176B parameters and is trained on 366B tokens in 46 languages and 13 programming languages. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_62.txt b/chunked/nltk_chunking/_model_summary/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0f1da22d07baf159f3b84cf117493052116b0fd --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_62.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[nlp-encoder-decoder]] +BART keeps the original Transformer architecture, but it modifies the pretraining objective with text infilling corruption, where some text spans are replaced with a single mask token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_63.txt b/chunked/nltk_chunking/_model_summary/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..b805954fa014a3d1731324b55826761738f2649b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_63.txt @@ -0,0 +1 @@ +The decoder predicts the uncorrupted tokens (future tokens are masked) and uses the encoder's hidden states to help it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_64.txt b/chunked/nltk_chunking/_model_summary/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..cefd056308d5055de68b8b78a4b117b3d0b649b9 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_64.txt @@ -0,0 +1 @@ +Pegasus is similar to BART, but Pegasus masks entire sentences instead of text spans. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_65.txt b/chunked/nltk_chunking/_model_summary/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..e8310caab4c94e102b7616fe6558242bde16d6a9 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_65.txt @@ -0,0 +1 @@ +In addition to masked language modeling, Pegasus is pretrained by gap sentence generation (GSG). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_66.txt b/chunked/nltk_chunking/_model_summary/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..2df7e5e1a9ff5102b44c95ec4ba4c807e9c24bcc --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_66.txt @@ -0,0 +1 @@ +The GSG objective masks whole sentences important to a document, replacing them with a mask token. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_67.txt b/chunked/nltk_chunking/_model_summary/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..941885d0ac635f3aa3c55fdab34fa353237fa9ba --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_67.txt @@ -0,0 +1 @@ +The decoder must generate the output from the remaining sentences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_68.txt b/chunked/nltk_chunking/_model_summary/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..54e7dfc1e72547dbf31e497b9aca3e19adcfef2a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_68.txt @@ -0,0 +1 @@ +T5 is a more unique model that casts all NLP tasks into a text-to-text problem using specific prefixes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_69.txt b/chunked/nltk_chunking/_model_summary/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a875af3d3b10e4c67948d2771287512764040e0 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_69.txt @@ -0,0 +1 @@ +For example, the prefix Summarize: indicates a summarization task. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_7.txt b/chunked/nltk_chunking/_model_summary/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6aa0a1ec3b74f40cba8c7d2ea2ed3fb4416ba7a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_7.txt @@ -0,0 +1,4 @@ +Computer vision + +Convolutional network +For a long time, convolutional networks (CNNs) were the dominant paradigm for computer vision tasks until the Vision Transformer demonstrated its scalability and efficiency. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_70.txt b/chunked/nltk_chunking/_model_summary/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..b6ed21d4e103efa39d922d8d3ab0424ecb2352c0 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_70.txt @@ -0,0 +1 @@ +T5 is pretrained by supervised (GLUE and SuperGLUE) training and self-supervised training (randomly sample and drop out 15% of tokens). \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_71.txt b/chunked/nltk_chunking/_model_summary/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..c51d5db86cfa0e5b8205aa569868b990f566a6e7 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_71.txt @@ -0,0 +1,4 @@ +Audio + +Encoder[[audio-encoder]] +Wav2Vec2 uses a Transformer encoder to learn speech representations directly from raw audio waveforms. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_72.txt b/chunked/nltk_chunking/_model_summary/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..3acce02fb2baaaba20feed66c0242df18f48775b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_72.txt @@ -0,0 +1 @@ +It is pretrained with a contrastive task to determine the true speech representation from a set of false ones. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_73.txt b/chunked/nltk_chunking/_model_summary/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..ceb5be73c9770ba50a096ba1cb9fc59a4a4e3eeb --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_73.txt @@ -0,0 +1 @@ +HuBERT is similar to Wav2Vec2 but has a different training process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_74.txt b/chunked/nltk_chunking/_model_summary/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..81e0fc875be3def47aad40013ff3701d96203c49 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_74.txt @@ -0,0 +1 @@ +Target labels are created by a clustering step in which segments of similar audio are assigned to a cluster which becomes a hidden unit. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_75.txt b/chunked/nltk_chunking/_model_summary/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..3b18ff4466a4d4eb51036482dbcc1ebc733a2095 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_75.txt @@ -0,0 +1 @@ +The hidden unit is mapped to an embedding to make a prediction. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_76.txt b/chunked/nltk_chunking/_model_summary/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..509001635e7695ac30c75adde603537b1158ad05 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_76.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[audio-encoder-decoder]] +Speech2Text is a speech model designed for automatic speech recognition (ASR) and speech translation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_77.txt b/chunked/nltk_chunking/_model_summary/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..014b31cf30b5d3f1f3fdd122f17920d681cf8fcc --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_77.txt @@ -0,0 +1 @@ +The model accepts log mel-filter bank features extracted from the audio waveform and pretrained autoregressively to generate a transcript or translation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_78.txt b/chunked/nltk_chunking/_model_summary/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..cf523a00f70e3a4fabc2af86afc8c069345adbc7 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_78.txt @@ -0,0 +1 @@ +Whisper is also an ASR model, but unlike many other speech models, it is pretrained on a massive amount of ✨ labeled ✨ audio transcription data for zero-shot performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_79.txt b/chunked/nltk_chunking/_model_summary/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..845260540f5c8deb2a8b0009bf7eba0805e73d3c --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_79.txt @@ -0,0 +1 @@ +A large chunk of the dataset also contains non-English languages, meaning Whisper can also be used for low-resource languages. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_8.txt b/chunked/nltk_chunking/_model_summary/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ad58b22bbdc6cbae7af67afc5f10d4b0a9eb09b --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_8.txt @@ -0,0 +1 @@ +Even then, some of a CNN's best qualities, like translation invariance, are so powerful (especially for certain tasks) that some Transformers incorporate convolutions in their architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_80.txt b/chunked/nltk_chunking/_model_summary/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..713ecda414ae133232319a733ff0981b9eb7a89d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_80.txt @@ -0,0 +1 @@ +Structurally, Whisper is similar to Speech2Text. 
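A minimal sketch of transcribing audio with an encoder-decoder speech checkpoint such as Whisper (the checkpoint name and audio path are placeholder examples; decoding a local file typically requires ffmpeg):
from transformers import pipeline

asr = pipeline("automatic-speech-recognition", model="openai/whisper-tiny")
# the encoder consumes the log-mel spectrogram and the decoder generates the transcript token by token
print(asr("path/to/audio.wav")["text"])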
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_81.txt b/chunked/nltk_chunking/_model_summary/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..cb41278e6531360211377ba1efe1233b9ba31672 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_81.txt @@ -0,0 +1 @@ +The audio signal is converted to a log-mel spectrogram encoded by the encoder. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_82.txt b/chunked/nltk_chunking/_model_summary/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd4883848164661ed2783788b6162a7c972ce3c2 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_82.txt @@ -0,0 +1 @@ +The decoder generates the transcript autoregressively from the encoder's hidden states and the previous tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_83.txt b/chunked/nltk_chunking/_model_summary/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..43bcb8cf0ee30293fab95f88217cd8f6c45fee79 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_83.txt @@ -0,0 +1,4 @@ +Multimodal + +Encoder[[mm-encoder]] +VisualBERT is a multimodal model for vision-language tasks released shortly after BERT. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_84.txt b/chunked/nltk_chunking/_model_summary/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..21d9f2d708d0d299751160dc3bf4c9a2c7a191dc --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_84.txt @@ -0,0 +1 @@ +It combines BERT and a pretrained object detection system to extract image features into visual embeddings, passed alongside text embeddings to BERT. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_85.txt b/chunked/nltk_chunking/_model_summary/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b78d5ae8b3dbd131d2166330e0ed11de3ecf078 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_85.txt @@ -0,0 +1 @@ +VisualBERT predicts the masked text based on the unmasked text and the visual embeddings, and it also has to predict whether the text is aligned with the image. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_86.txt b/chunked/nltk_chunking/_model_summary/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6f497a1f496d1bc335fa7c5e553f9f4ca2c8637 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_86.txt @@ -0,0 +1 @@ +When ViT was released, ViLT adopted ViT in its architecture because it was easier to get the image embeddings this way. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_87.txt b/chunked/nltk_chunking/_model_summary/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..65847164ba0e6bab5fe8afb8ddc63229649ece5a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_87.txt @@ -0,0 +1 @@ +The image embeddings are jointly processed with the text embeddings. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_88.txt b/chunked/nltk_chunking/_model_summary/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa4c02db001428cea0585194764a31bd2c302c54 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_88.txt @@ -0,0 +1 @@ +From there, ViLT is pretrained by image text matching, masked language modeling, and whole word masking. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_89.txt b/chunked/nltk_chunking/_model_summary/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa2604498c500fb02da75f184a7135ef60b18500 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_89.txt @@ -0,0 +1 @@ +CLIP takes a different approach and makes a pair prediction of (image, text) . \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_9.txt b/chunked/nltk_chunking/_model_summary/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec206ae757e96fea1992f30831149ee9d4e305eb --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_9.txt @@ -0,0 +1 @@ +ConvNeXt flipped this exchange around and incorporated design choices from Transformers to modernize a CNN. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_90.txt b/chunked/nltk_chunking/_model_summary/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..cfe73a219a087f48f91f722dff98091738b1c5e8 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_90.txt @@ -0,0 +1 @@ +An image encoder (ViT) and a text encoder (Transformer) are jointly trained on a 400 million (image, text) pair dataset to maximize the similarity between the image and text embeddings of the (image, text) pairs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_91.txt b/chunked/nltk_chunking/_model_summary/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c6cf84055e345d8bb98ea32b7a242cd4cc7cd70 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_91.txt @@ -0,0 +1 @@ +After pretraining, you can use natural language to instruct CLIP to predict the text given an image or vice versa. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_92.txt b/chunked/nltk_chunking/_model_summary/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..d051ef31bc384e6530970c519880ca5b5ac32a1a --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_92.txt @@ -0,0 +1 @@ +OWL-ViT builds on top of CLIP by using it as its backbone for zero-shot object detection. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_93.txt b/chunked/nltk_chunking/_model_summary/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..92abd85f2a18823d36656f7ee76c80eaf608e6b3 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_93.txt @@ -0,0 +1 @@ +After pretraining, an object detection head is added to make a set prediction over the (class, bounding box) pairs. 
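A minimal sketch of zero-shot object detection with an OWL-ViT checkpoint, where the candidate labels are supplied as free-form text at inference time (the checkpoint, image path, and labels are placeholder examples):
from transformers import pipeline

detector = pipeline("zero-shot-object-detection", model="google/owlvit-base-patch32")
results = detector("path/to/photo.jpg", candidate_labels=["a cat", "a remote control"])
for r in results:
    print(r["label"], r["score"], r["box"])  # label text, confidence, bounding box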
\ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_94.txt b/chunked/nltk_chunking/_model_summary/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cb911448b344d9e64c2e9605fd0739bca191c65 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_94.txt @@ -0,0 +1,2 @@ +Encoder-decoder[[mm-encoder-decoder]] +Optical character recognition (OCR) is a long-standing text recognition task that typically involves several components to understand the image and generate the text. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_95.txt b/chunked/nltk_chunking/_model_summary/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..e25041cb47ec5357228565135a0ac8810ffffe2c --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_95.txt @@ -0,0 +1 @@ +TrOCR simplifies the process using an end-to-end Transformer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_96.txt b/chunked/nltk_chunking/_model_summary/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0a2273c94123fc68ab3a9ab0827638460d14636 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_96.txt @@ -0,0 +1 @@ +The encoder is a ViT-style model for image understanding and processes the image as fixed-size patches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_97.txt b/chunked/nltk_chunking/_model_summary/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..96bb682e6984f07638f3720aec41bd13d1030e94 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_97.txt @@ -0,0 +1 @@ +The decoder accepts the encoder's hidden states and autoregressively generates text. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_98.txt b/chunked/nltk_chunking/_model_summary/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..eaf0ba3284797af3cc1e6ab564dc671802fe433d --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_98.txt @@ -0,0 +1 @@ +Donut is a more general visual document understanding model that doesn't rely on OCR-based approaches. \ No newline at end of file diff --git a/chunked/nltk_chunking/_model_summary/chunk_99.txt b/chunked/nltk_chunking/_model_summary/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..76212bd1b7a5c97c2240102c8564c0a40b7825e7 --- /dev/null +++ b/chunked/nltk_chunking/_model_summary/chunk_99.txt @@ -0,0 +1 @@ +It uses a Swin Transformer as the encoder and multilingual BART as the decoder. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_0.txt b/chunked/nltk_chunking/_multilingual/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea06c91897374fc46659a3271f77f1268c207368 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_0.txt @@ -0,0 +1,4 @@ + +Multilingual models for inference +[[open-in-colab]] +There are several multilingual models in 🤗 Transformers, and their inference usage differs from monolingual models. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_1.txt b/chunked/nltk_chunking/_multilingual/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..4198f535ac94f78040d9f35c27d8d2da659d1739 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_1.txt @@ -0,0 +1 @@ +Not all multilingual model usage is different though. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_10.txt b/chunked/nltk_chunking/_multilingual/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4159a54bfa27c1f277ffa908ab419f864e4e8463 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_10.txt @@ -0,0 +1 @@ +This tensor should be the same size as input_ids. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_11.txt b/chunked/nltk_chunking/_multilingual/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa22000157aa3c9bc1375ea6ed0818f5bc8c13d6 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_11.txt @@ -0,0 +1,10 @@ +language_id = tokenizer.lang2id["en"] # 0 +langs = torch.tensor([language_id] * input_ids.shape[1]) # torch.tensor([0, 0, 0, , 0]) +We reshape it to be of size (batch_size, sequence_length) +langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1) + +Now you can pass the input_ids and language embedding to the model: + +outputs = model(input_ids, langs=langs) + +The run_generation.py script can generate text with language embeddings using the xlm-clm checkpoints. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_12.txt b/chunked/nltk_chunking/_multilingual/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..36efcb8ad1abf3e95e81fb4ea61726da5e7c9388 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_12.txt @@ -0,0 +1,7 @@ +XLM without language embeddings +The following XLM models do not require language embeddings during inference: + +FacebookAI/xlm-mlm-17-1280 (Masked language modeling, 17 languages) +FacebookAI/xlm-mlm-100-1280 (Masked language modeling, 100 languages) + +These models are used for generic sentence representations, unlike the previous XLM checkpoints. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_13.txt b/chunked/nltk_chunking/_multilingual/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5cfcd23e0cdd59118d8b13f903a605140b850d7 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_13.txt @@ -0,0 +1,7 @@ +BERT +The following BERT models can be used for multilingual tasks: + +google-bert/bert-base-multilingual-uncased (Masked language modeling + Next sentence prediction, 102 languages) +google-bert/bert-base-multilingual-cased (Masked language modeling + Next sentence prediction, 104 languages) + +These models do not require language embeddings during inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_14.txt b/chunked/nltk_chunking/_multilingual/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..634b63398c347de9be9180aa2a9c424ec36ef9c5 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_14.txt @@ -0,0 +1,2 @@ +They should identify the language from the +context and infer accordingly. 
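A minimal sketch showing that a multilingual BERT checkpoint needs no language id at inference; the input sentence alone tells it which language it is working with (the checkpoint and sentence are placeholder examples):
from transformers import pipeline

unmasker = pipeline("fill-mask", model="google-bert/bert-base-multilingual-cased")
# a French input works without any extra configuration or language embedding
print(unmasker("Paris est la capitale de la [MASK].")[0])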
\ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_15.txt b/chunked/nltk_chunking/_multilingual/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..b821fe00c713a25e48621f879190f616618610a8 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_15.txt @@ -0,0 +1,7 @@ +XLM-RoBERTa +The following XLM-RoBERTa models can be used for multilingual tasks: + +FacebookAI/xlm-roberta-base (Masked language modeling, 100 languages) +FacebookAI/xlm-roberta-large (Masked language modeling, 100 languages) + +XLM-RoBERTa was trained on 2.5TB of newly created and cleaned CommonCrawl data in 100 languages. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_16.txt b/chunked/nltk_chunking/_multilingual/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..75391905722c4495a708ca805cec72273a890858 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_16.txt @@ -0,0 +1 @@ +It provides strong gains over previously released multilingual models like mBERT or XLM on downstream tasks like classification, sequence labeling, and question answering. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_17.txt b/chunked/nltk_chunking/_multilingual/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..de719a8130815b1fd6daa3a6d7dcdd304c7a9aa8 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_17.txt @@ -0,0 +1,7 @@ +M2M100 +The following M2M100 models can be used for multilingual translation: + +facebook/m2m100_418M (Translation) +facebook/m2m100_1.2B (Translation) + +In this example, load the facebook/m2m100_418M checkpoint to translate from Chinese to English. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_18.txt b/chunked/nltk_chunking/_multilingual/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..0071e4bc41b333e0aa1759539ea01cbb902b5abd --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_18.txt @@ -0,0 +1,4 @@ +You can set the source language in the tokenizer: + +from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer +en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_19.txt b/chunked/nltk_chunking/_multilingual/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..a042c0b65e0ede9d5e5496f2ff9604426126aca7 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_19.txt @@ -0,0 +1 @@ +chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒." \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_2.txt b/chunked/nltk_chunking/_multilingual/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..75f041483154d84ef3ed3e8b89a568af90fde168 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_2.txt @@ -0,0 +1 @@ +Some models, like google-bert/bert-base-multilingual-uncased, can be used just like a monolingual model.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_20.txt b/chunked/nltk_chunking/_multilingual/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..f98aec7c41063301e36f4658b5b3ce421029a8ab --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_20.txt @@ -0,0 +1,8 @@ +tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh") +model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M") + +Tokenize the text: + +encoded_zh = tokenizer(chinese_text, return_tensors="pt") + +M2M100 forces the target language id as the first generated token to translate to the target language. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_21.txt b/chunked/nltk_chunking/_multilingual/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5ffc2efdd5e572bf881dcfddebc909a89c87b0b --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_21.txt @@ -0,0 +1,5 @@ +Set the forced_bos_token_id to en in the generate method to translate to English: + +generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en")) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.' \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_22.txt b/chunked/nltk_chunking/_multilingual/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..edcc1a689693cae0f6c73126d4e31c047053a8e8 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_22.txt @@ -0,0 +1,10 @@ +MBart +The following MBart models can be used for multilingual translation: + +facebook/mbart-large-50-one-to-many-mmt (One-to-many multilingual machine translation, 50 languages) +facebook/mbart-large-50-many-to-many-mmt (Many-to-many multilingual machine translation, 50 languages) +facebook/mbart-large-50-many-to-one-mmt (Many-to-one multilingual machine translation, 50 languages) +facebook/mbart-large-50 (Multilingual translation, 50 languages) +facebook/mbart-large-cc25 + +In this example, load the facebook/mbart-large-50-many-to-many-mmt checkpoint to translate Finnish to English. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_23.txt b/chunked/nltk_chunking/_multilingual/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..6bea13207ae3f320609b54dad0a0db3579ab0b68 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_23.txt @@ -0,0 +1,4 @@ +You can set the source language in the tokenizer: + +from transformers import AutoTokenizer, AutoModelForSeq2SeqLM +en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger." \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_24.txt b/chunked/nltk_chunking/_multilingual/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..296b9a51aa6880b25051fb36440de534e93656a1 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_24.txt @@ -0,0 +1 @@ +fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia." 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_25.txt b/chunked/nltk_chunking/_multilingual/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..844f708e84ba52ccda771b70e184de96b38ef1f6 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_25.txt @@ -0,0 +1,8 @@ +tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI") +model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt") + +Tokenize the Finnish text: + +encoded_fi = tokenizer(fi_text, return_tensors="pt") + +MBart forces the target language id as the first generated token to translate to the target language. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_26.txt b/chunked/nltk_chunking/_multilingual/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bdbc6566552ba1ad5ef890c9a9e74168f5598a2 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_26.txt @@ -0,0 +1,5 @@ +Set the forced_bos_token_id to en in the generate method to translate to English: + +generated_tokens = model.generate(**encoded_fi, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"]) +tokenizer.batch_decode(generated_tokens, skip_special_tokens=True) +"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry." \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_27.txt b/chunked/nltk_chunking/_multilingual/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca4ca1ddefff73060cf178a4bf86b3ac17905118 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_27.txt @@ -0,0 +1 @@ +If you are using the facebook/mbart-large-50-many-to-one-mmt checkpoint, you don't need to force the target language id as the first generated token otherwise the usage is the same. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_3.txt b/chunked/nltk_chunking/_multilingual/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6d3a38c1c895ba8e0e20c256d9b84ea31de142b --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_3.txt @@ -0,0 +1 @@ +This guide will show you how to use multilingual models whose usage differs for inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_4.txt b/chunked/nltk_chunking/_multilingual/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2c9b3ae60dc2ec62504901e50de7cfc958db0e2 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_4.txt @@ -0,0 +1,2 @@ +XLM +XLM has ten different checkpoints, only one of which is monolingual. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_5.txt b/chunked/nltk_chunking/_multilingual/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2430baf7cf5d15edfaf732b98e796903324465c8 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_5.txt @@ -0,0 +1 @@ +The nine remaining model checkpoints can be split into two categories: the checkpoints that use language embeddings and those that don't.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_6.txt b/chunked/nltk_chunking/_multilingual/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dfacf3b2e1ee6e617a364433419c1bb8318e0ba --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_6.txt @@ -0,0 +1,12 @@ +XLM with language embeddings +The following XLM models use language embeddings to specify the language used at inference: + +FacebookAI/xlm-mlm-ende-1024 (Masked language modeling, English-German) +FacebookAI/xlm-mlm-enfr-1024 (Masked language modeling, English-French) +FacebookAI/xlm-mlm-enro-1024 (Masked language modeling, English-Romanian) +FacebookAI/xlm-mlm-xnli15-1024 (Masked language modeling, XNLI languages) +FacebookAI/xlm-mlm-tlm-xnli15-1024 (Masked language modeling + translation, XNLI languages) +FacebookAI/xlm-clm-enfr-1024 (Causal language modeling, English-French) +FacebookAI/xlm-clm-ende-1024 (Causal language modeling, English-German) + +Language embeddings are represented as a tensor of the same shape as the input_ids passed to the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_7.txt b/chunked/nltk_chunking/_multilingual/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c22bdb85bd145cd08bc0d58897b3cef9337f0b6a --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_7.txt @@ -0,0 +1 @@ +The values in these tensors depend on the language used and are identified by the tokenizer's lang2id and id2lang attributes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_8.txt b/chunked/nltk_chunking/_multilingual/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..f618f397e007ab2c086abffa56c1f38fd0c46734 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_8.txt @@ -0,0 +1,17 @@ +In this example, load the FacebookAI/xlm-clm-enfr-1024 checkpoint (Causal language modeling, English-French): + +import torch +from transformers import XLMTokenizer, XLMWithLMHeadModel +tokenizer = XLMTokenizer.from_pretrained("FacebookAI/xlm-clm-enfr-1024") +model = XLMWithLMHeadModel.from_pretrained("FacebookAI/xlm-clm-enfr-1024") + +The lang2id attribute of the tokenizer displays this model's languages and their ids: + +print(tokenizer.lang2id) +{'en': 0, 'fr': 1} + +Next, create an example input: + +input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1 + +Set the language id as "en" and use it to define the language embedding. \ No newline at end of file diff --git a/chunked/nltk_chunking/_multilingual/chunk_9.txt b/chunked/nltk_chunking/_multilingual/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..3dc3b1726aa8949ca36376da0366494a2d8d1094 --- /dev/null +++ b/chunked/nltk_chunking/_multilingual/chunk_9.txt @@ -0,0 +1 @@ +The language embedding is a tensor filled with 0 since that is the language id for English. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_notebooks/chunk_0.txt b/chunked/nltk_chunking/_notebooks/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..10fb7a7b979ad8a87ca2401f5b363ab1bdadfdd6 --- /dev/null +++ b/chunked/nltk_chunking/_notebooks/chunk_0.txt @@ -0,0 +1 @@ +../../../notebooks/README.md \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_0.txt b/chunked/nltk_chunking/_pad_truncation/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eb20e079338d3a7111b668bd0659da949ede4fb --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_0.txt @@ -0,0 +1,3 @@ + +Padding and truncation +Batched inputs are often different lengths, so they can't be converted to fixed-size tensors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_1.txt b/chunked/nltk_chunking/_pad_truncation/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..89facab8bf33cb53b47ed3a01b0252f8d51708d0 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_1.txt @@ -0,0 +1 @@ +Padding and truncation are strategies for dealing with this problem, to create rectangular tensors from batches of varying lengths. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_10.txt b/chunked/nltk_chunking/_pad_truncation/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..d039d05dace03a142e7a7739009142150edea9e8 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_10.txt @@ -0,0 +1 @@ +Padding will still be applied if you only provide a single sequence. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_11.txt b/chunked/nltk_chunking/_pad_truncation/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3c4c05674c138dc3c01848589afda92fb52f15f8 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_11.txt @@ -0,0 +1 @@ +False or 'do_not_pad': no padding is applied. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_12.txt b/chunked/nltk_chunking/_pad_truncation/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..049a5a8078835d009eb59dd034f210db553bc64c --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_12.txt @@ -0,0 +1 @@ +This is the default behavior. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_13.txt b/chunked/nltk_chunking/_pad_truncation/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ffd8c0c7e436cd6327a8ecd426a3b3d303e897 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_13.txt @@ -0,0 +1 @@ +The truncation argument controls truncation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_14.txt b/chunked/nltk_chunking/_pad_truncation/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..17cbe34d9c3c5779271e35f2510ad0e452cc05ab --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_14.txt @@ -0,0 +1,4 @@ +It can be a boolean or a string: + +True or 'longest_first': truncate to a maximum length specified by the max_length argument or + the maximum length accepted by the model if no max_length is provided (max_length=None). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_15.txt b/chunked/nltk_chunking/_pad_truncation/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..393940a68c468a77d3eeafd842e16cb9b35c22c5 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_15.txt @@ -0,0 +1,3 @@ +This will + truncate token by token, removing a token from the longest sequence in the pair until the proper length is + reached. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_16.txt b/chunked/nltk_chunking/_pad_truncation/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..9e7f4561b8a9a86ac5f564074cfe09d74abec3bf --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_16.txt @@ -0,0 +1,2 @@ +'only_second': truncate to a maximum length specified by the max_length argument or the maximum + length accepted by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_17.txt b/chunked/nltk_chunking/_pad_truncation/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..044c84abbc5822ce50222ed6e5818053a3fe0569 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_17.txt @@ -0,0 +1,2 @@ +This will only truncate + the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_18.txt b/chunked/nltk_chunking/_pad_truncation/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..aa48350cf9a22e8f0c7f4e2671b01f512ed3629d --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_18.txt @@ -0,0 +1,2 @@ +'only_first': truncate to a maximum length specified by the max_length argument or the maximum + length accepted by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_19.txt b/chunked/nltk_chunking/_pad_truncation/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9797409b9bbbb2d6806a28f07fcbde04431413c0 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_19.txt @@ -0,0 +1,2 @@ +This will only truncate + the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_2.txt b/chunked/nltk_chunking/_pad_truncation/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0be3c874ce66ad3f87d4ce71af019a4f1d89a24 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_2.txt @@ -0,0 +1 @@ +Padding adds a special padding token to ensure shorter sequences will have the same length as either the longest sequence in a batch or the maximum length accepted by the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_20.txt b/chunked/nltk_chunking/_pad_truncation/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..afb7558c6e4077bde81cff82d44112f04def18a7 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_20.txt @@ -0,0 +1 @@ +False or 'do_not_truncate': no truncation is applied. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_21.txt b/chunked/nltk_chunking/_pad_truncation/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..049a5a8078835d009eb59dd034f210db553bc64c --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_21.txt @@ -0,0 +1 @@ +This is the default behavior. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_22.txt b/chunked/nltk_chunking/_pad_truncation/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..d819dc4ab95dcf63c7a9d82273702d8479966731 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_22.txt @@ -0,0 +1 @@ +The max_length argument controls the length of the padding and truncation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_23.txt b/chunked/nltk_chunking/_pad_truncation/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..3190d9599fec8ca2f5850cd5024dd2b2b5c21f84 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_23.txt @@ -0,0 +1 @@ +It can be an integer or None, in which case it will default to the maximum length the model can accept. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_24.txt b/chunked/nltk_chunking/_pad_truncation/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..5442d473cc6741a38dfbe77b42bc42859f892cf2 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_24.txt @@ -0,0 +1 @@ +If the model has no specific maximum input length, truncation or padding to max_length is deactivated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_25.txt b/chunked/nltk_chunking/_pad_truncation/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..0eb3914bcde8e9d779323d03dec6a13ee4040933 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_25.txt @@ -0,0 +1 @@ +The following table summarizes the recommended way to setup padding and truncation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_26.txt b/chunked/nltk_chunking/_pad_truncation/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f96ed4618f3a7fccf85b80770ceea469d68d9ac --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_26.txt @@ -0,0 +1,2 @@ +If you use pairs of input sequences in any of the following examples, you can replace truncation=True by a STRATEGY selected in +['only_first', 'only_second', 'longest_first'], i.e. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_27.txt b/chunked/nltk_chunking/_pad_truncation/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ecca11e0f74413be99dbc68db0a981f83bd1fe4 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_27.txt @@ -0,0 +1 @@ +truncation='only_second' or truncation='longest_first' to control how both sequences in the pair are truncated as detailed before. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_28.txt b/chunked/nltk_chunking/_pad_truncation/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..27aa7faac2a2a130a1cb018d9914344e20ef9097 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_28.txt @@ -0,0 +1,22 @@ +| Truncation | Padding | Instruction | +|--------------------------------------|-----------------------------------|---------------------------------------------------------------------------------------------| +| no truncation | no padding | tokenizer(batch_sentences) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True) or | +| | | tokenizer(batch_sentences, padding='longest') | +| | padding to max model input length | tokenizer(batch_sentences, padding='max_length') | +| | padding to specific length | tokenizer(batch_sentences, padding='max_length', max_length=42) | +| | padding to a multiple of a value | tokenizer(batch_sentences, padding=True, pad_to_multiple_of=8) | +| truncation to max model input length | no padding | tokenizer(batch_sentences, truncation=True) or | +| | | tokenizer(batch_sentences, truncation=STRATEGY) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True, truncation=True) or | +| | | tokenizer(batch_sentences, padding=True, truncation=STRATEGY) | +| | padding to max model input length | tokenizer(batch_sentences, padding='max_length', truncation=True) or | +| | | tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY) | +| | padding to specific length | Not possible | +| truncation to specific length | no padding | tokenizer(batch_sentences, truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, truncation=STRATEGY, max_length=42) | +| | padding to max sequence in batch | tokenizer(batch_sentences, padding=True, truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42) | +| | padding to max model input length | Not possible | +| | padding to specific length | tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42) or | +| | | tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42) | \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_3.txt b/chunked/nltk_chunking/_pad_truncation/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..dec28b66b399435cf9aaa0aadd43dd840c0eee0f --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_3.txt @@ -0,0 +1 @@ +Truncation works in the other direction by truncating long sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_4.txt b/chunked/nltk_chunking/_pad_truncation/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..c581358c1c8dd3bfccc63700a5d456052f95f74e --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_4.txt @@ -0,0 +1 @@ +In most cases, padding your batch to the length of the longest sequence and truncating to the maximum length a model can accept works pretty well. 
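As a quick illustration of that recommended default setup (a minimal sketch; the checkpoint and example sentences are placeholders rather than part of the original chunks):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")  # placeholder checkpoint

batch_sentences = [
    "A short sentence.",
    "A somewhat longer sentence that will determine the padded length of the batch.",
]

# Pad to the longest sequence in the batch and truncate to the model's maximum input length.
encoded = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
print(encoded["input_ids"].shape)  # -> torch.Size([2, <length of the longest sequence>])
```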
\ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_5.txt b/chunked/nltk_chunking/_pad_truncation/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cdf4ac88a5e3a0e745af8dcab2c735d68a714d1 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_5.txt @@ -0,0 +1 @@ +However, the API supports more strategies if you need them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_6.txt b/chunked/nltk_chunking/_pad_truncation/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..64b97a396af13587ed49b9a45ad83a22f9a2ef04 --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_6.txt @@ -0,0 +1 @@ +The three arguments you need to know are: padding, truncation and max_length. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_7.txt b/chunked/nltk_chunking/_pad_truncation/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..39a296682c8d7d6654897ba39e9d0b82348163ac --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_7.txt @@ -0,0 +1 @@ +The padding argument controls padding. \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_8.txt b/chunked/nltk_chunking/_pad_truncation/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..498740f30a5fa3c397f164c8bb89a0db7b07894a --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_8.txt @@ -0,0 +1,4 @@ +It can be a boolean or a string: + +True or 'longest': pad to the longest sequence in the batch (no padding is applied if you only provide + a single sequence). \ No newline at end of file diff --git a/chunked/nltk_chunking/_pad_truncation/chunk_9.txt b/chunked/nltk_chunking/_pad_truncation/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef19bb111bbc788b16e79fb0b8144cc7647de07b --- /dev/null +++ b/chunked/nltk_chunking/_pad_truncation/chunk_9.txt @@ -0,0 +1,2 @@ +'max_length': pad to a length specified by the max_length argument or the maximum length accepted + by the model if no max_length is provided (max_length=None). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_0.txt b/chunked/nltk_chunking/_perf_hardware/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..f37fde8f5a13c8cf1de4698f07cf191c2b6c270b --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_0.txt @@ -0,0 +1,3 @@ + +Custom hardware for training +The hardware you use to run model training and inference can have a big effect on performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_1.txt b/chunked/nltk_chunking/_perf_hardware/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fbd8dafefcd047a6ddde56027302f3be5c3e14d --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_1.txt @@ -0,0 +1 @@ +For a deep dive into GPUs, make sure to check out Tim Dettmers' excellent blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_10.txt b/chunked/nltk_chunking/_perf_hardware/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0789ad9845989844d21512863d72c0772890e9ae --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_10.txt @@ -0,0 +1 @@ +Each PCI-E 8-Pin power cable needs to be plugged into a 12V rail on the PSU side and can supply up to 150W of power.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_11.txt b/chunked/nltk_chunking/_perf_hardware/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..5444fef8632a42aa084c19d92ee959df3ec9e4c0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_11.txt @@ -0,0 +1 @@ +Some other cards may use PCI-E 12-Pin connectors, and these can deliver up to 500-600W of power. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_12.txt b/chunked/nltk_chunking/_perf_hardware/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..1f26ff3b4de80d1a70c4953a386703be82fe71f5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_12.txt @@ -0,0 +1 @@ +Low-end cards may use 6-Pin connectors, which supply up to 75W of power. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_13.txt b/chunked/nltk_chunking/_perf_hardware/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a835265f549728ee4fbdc18ff3cd2d6cb84d4049 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_13.txt @@ -0,0 +1 @@ +Additionally, you want a high-end PSU that delivers stable voltage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_14.txt b/chunked/nltk_chunking/_perf_hardware/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..50d60d9bb6d2dde080496b911cda51bd4191c783 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_14.txt @@ -0,0 +1 @@ +Some lower-quality ones may not give the card the stable voltage it needs to function at its peak. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_15.txt b/chunked/nltk_chunking/_perf_hardware/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a028374d982ee45390d4eb4942b69d68696bfd96 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_15.txt @@ -0,0 +1 @@ +And of course, the PSU needs to have enough unused watts to power the card. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_16.txt b/chunked/nltk_chunking/_perf_hardware/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9ff56c17b77b5167ace939364c6ada34711f02d --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_16.txt @@ -0,0 +1,2 @@ +Cooling: +When a GPU overheats, it will start throttling down and will not deliver full performance; it can even shut down if it gets too hot. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_17.txt b/chunked/nltk_chunking/_perf_hardware/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e604b1eab34b03fd494f8dbea260a5f8c99fa28 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_17.txt @@ -0,0 +1 @@ +It's hard to say exactly which temperature to strive for when a GPU is heavily loaded, but anything under +80C is probably fine; lower is better, and 70-75C is an excellent range to be in. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_18.txt b/chunked/nltk_chunking/_perf_hardware/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..de8292dfe3592a169eb944a35df961e528b5b98c --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_18.txt @@ -0,0 +1 @@ +The throttling down is likely to start at around 84-90C.
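The chunks above don't include any tooling for checking this, but as an illustrative aside, GPU temperature and power draw can be polled from Python with the NVML bindings. The `nvidia-ml-py` package (providing the `pynvml` module) is an assumed extra dependency, not something the original text mentions:

```python
# pip install nvidia-ml-py   (provides the `pynvml` module; assumed dependency)
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # first GPU

temp_c = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
power_w = pynvml.nvmlDeviceGetPowerUsage(handle) / 1000  # NVML reports milliwatts

print(f"GPU0: {temp_c}C, {power_w:.0f}W")
if temp_c >= 84:
    print("Approaching the typical throttling range (~84-90C) mentioned above.")

pynvml.nvmlShutdown()
```

Polling this while a training run is under load is an easy way to see whether the card is staying in the 70-75C range discussed above.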
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_19.txt b/chunked/nltk_chunking/_perf_hardware/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d974b8c0799cf140e395a0d90c2f4b614f9ce93 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_19.txt @@ -0,0 +1 @@ +But other than throttling performance a prolonged very high temperature is likely to reduce the lifespan of a GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_2.txt b/chunked/nltk_chunking/_perf_hardware/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d310ebca1e805d20a1ac18f453fee18241b6c24 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_2.txt @@ -0,0 +1 @@ +Let's have a look at some practical advice for GPU setups. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_20.txt b/chunked/nltk_chunking/_perf_hardware/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..e14638337f6b46e59a9f9b65e790eeeefb057f32 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_20.txt @@ -0,0 +1 @@ +Next let's have a look at one of the most important aspects when having multiple GPUs: connectivity. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_21.txt b/chunked/nltk_chunking/_perf_hardware/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ab16dfd6398e43d66859e34adc8194207520c4b --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_21.txt @@ -0,0 +1,2 @@ +Multi-GPU Connectivity +If you use multiple GPUs the way cards are inter-connected can have a huge impact on the total training time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_22.txt b/chunked/nltk_chunking/_perf_hardware/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..a21af511e9fa7802be4fb4792a36956417ef57ea --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_22.txt @@ -0,0 +1,4 @@ +If the GPUs are on the same physical node, you can run: + +nvidia-smi topo -m +and it will tell you how the GPUs are inter-connected. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_23.txt b/chunked/nltk_chunking/_perf_hardware/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f4933559eab5b172bb5722264b5b7119c1d74bf --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_23.txt @@ -0,0 +1,17 @@ +On a machine with dual-GPU and which are connected with NVLink, you will most likely see something like: +GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X NV2 0-23 N/A +GPU1 NV2 X 0-23 N/A +on a different machine w/o NVLink we may see: +GPU0 GPU1 CPU Affinity NUMA Affinity +GPU0 X PHB 0-11 N/A +GPU1 PHB X 0-11 N/A +The report includes this legend: +X = Self + SYS = Connection traversing PCIe as well as the SMP interconnect between NUMA nodes (e.g., QPI/UPI) + NODE = Connection traversing PCIe as well as the interconnect between PCIe Host Bridges within a NUMA node + PHB = Connection traversing PCIe as well as a PCIe Host Bridge (typically the CPU) + PXB = Connection traversing multiple PCIe bridges (without traversing the PCIe Host Bridge) + PIX = Connection traversing at most a single PCIe bridge + NV# = Connection traversing a bonded set of # NVLinks +So the first report NV2 tells us the GPUs are interconnected with 2 NVLinks, and the second report PHB we have a typical consumer-level PCIe+Bridge setup. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_24.txt b/chunked/nltk_chunking/_perf_hardware/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..83c000e2ad3c194ef5c9be019e78b3637a4e452a --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_24.txt @@ -0,0 +1 @@ +Check what type of connectivity you have on your setup. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_25.txt b/chunked/nltk_chunking/_perf_hardware/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..939fcf5839cddafa9591d1f5a372085c7127c54c --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_25.txt @@ -0,0 +1 @@ +Some of these will make the communication between cards faster (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_26.txt b/chunked/nltk_chunking/_perf_hardware/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5aa3f7740b0a644a36500bf3deb5f059202cf15 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_26.txt @@ -0,0 +1 @@ +NVLink), others slower (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_27.txt b/chunked/nltk_chunking/_perf_hardware/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..2dc462a0c6a3feebbb64f10c2dc725d90eab7f52 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_27.txt @@ -0,0 +1 @@ +PHB). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_28.txt b/chunked/nltk_chunking/_perf_hardware/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..acbc60c786195b036f24101ecf27f9f4a5a202b8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_28.txt @@ -0,0 +1 @@ +Depending on the type of scalability solution used, the connectivity speed could have a major or a minor impact. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_29.txt b/chunked/nltk_chunking/_perf_hardware/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..d96a52b33aaa5cd01a5ff806b9bde9247e24dadb --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_29.txt @@ -0,0 +1 @@ +If the GPUs need to sync rarely, as in DDP, the impact of a slower connection will be less significant. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_3.txt b/chunked/nltk_chunking/_perf_hardware/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..269695e00f600a59799198d3bcf39f740b72cef5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_3.txt @@ -0,0 +1,8 @@ +GPU +When you train bigger models you have essentially three options: + +bigger GPUs +more GPUs +more CPU and NVMe (offloaded to by DeepSpeed-Infinity) + +Let's start at the case where you have a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_30.txt b/chunked/nltk_chunking/_perf_hardware/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec27df932f418f1e6720d18577c0e42b64ce25bc --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_30.txt @@ -0,0 +1 @@ +If the GPUs need to send messages to each other often, as in ZeRO-DP, then faster connectivity becomes super important to achieve faster training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_31.txt b/chunked/nltk_chunking/_perf_hardware/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..de1a49f76666b90ea1bd2a891b266bcda532b655 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_31.txt @@ -0,0 +1,2 @@ +NVlink +NVLink is a wire-based serial multi-lane near-range communications link developed by Nvidia. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_32.txt b/chunked/nltk_chunking/_perf_hardware/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..0adddac0304f264cbc6b241c51c86279640f75f1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_32.txt @@ -0,0 +1 @@ +Each new generation provides a faster bandwidth, e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_33.txt b/chunked/nltk_chunking/_perf_hardware/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..1dc390f0631b9a3397ecda61ebfa881b7146978b --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_33.txt @@ -0,0 +1,5 @@ +here is a quote from Nvidia Ampere GA102 GPU Architecture: + +Third-Generation NVLink® +GA102 GPUs utilize NVIDIA’s third-generation NVLink interface, which includes four x4 links, +with each link providing 14.0625 GB/sec bandwidth in each direction between two GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_34.txt b/chunked/nltk_chunking/_perf_hardware/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..4884013147c8bb4fa998a8afbc08e71aad9712d6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_34.txt @@ -0,0 +1,3 @@ +Four +links provide 56.25 GB/sec bandwidth in each direction, and 112.5 GB/sec total bandwidth +between two GPUs. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_35.txt b/chunked/nltk_chunking/_perf_hardware/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ed6e1c48154c8466884ef92dad7fd90484f5877 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_35.txt @@ -0,0 +1 @@ +Two RTX 3090 GPUs can be connected together for SLI using NVLink. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_36.txt b/chunked/nltk_chunking/_perf_hardware/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..899100e4ff7f7a6861df79ded98dc21b4737a078 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_36.txt @@ -0,0 +1 @@ +(Note that 3-Way and 4-Way SLI configurations are not supported.) \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_37.txt b/chunked/nltk_chunking/_perf_hardware/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..41499e92f896fb33827eb38ed4644c059e665e1f --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_37.txt @@ -0,0 +1 @@ +So the higher X you get in the report of NVX in the output of nvidia-smi topo -m the better. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_38.txt b/chunked/nltk_chunking/_perf_hardware/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..0e378811bab5a2269c2420f116e3fbab0c9ac25a --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_38.txt @@ -0,0 +1 @@ +The generation will depend on your GPU architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_39.txt b/chunked/nltk_chunking/_perf_hardware/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..217a6230e8541b93068cdc3412138958df00beed --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_39.txt @@ -0,0 +1 @@ +Let's compare the execution of a openai-community/gpt2 language model training over a small sample of wikitext. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_4.txt b/chunked/nltk_chunking/_perf_hardware/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..c37cb26c4b42c822704fd492f65a8504f7ad7db8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_4.txt @@ -0,0 +1,2 @@ +Power and Cooling +If you bought an expensive high end GPU make sure you give it the correct power and sufficient cooling. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_40.txt b/chunked/nltk_chunking/_perf_hardware/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b414389728acff50329c8905b586ba5ded8e2e4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_40.txt @@ -0,0 +1,6 @@ +The results are: +| NVlink | Time | +| ----- | ---: | +| Y | 101s | +| N | 131s | +You can see that NVLink completes the training ~23% faster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_41.txt b/chunked/nltk_chunking/_perf_hardware/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..84c4ca278669234ec8b839ff8b4f665a8d121a23 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_41.txt @@ -0,0 +1 @@ +In the second benchmark we use NCCL_P2P_DISABLE=1 to tell the GPUs not to use NVLink. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_42.txt b/chunked/nltk_chunking/_perf_hardware/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..72882d938123b8bb6880a837e4f44dae8ecbac18 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_42.txt @@ -0,0 +1,17 @@ +Here is the full benchmark code and outputs: +```bash +# DDP w/ NVLink +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 torchrun \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} +# DDP w/o NVLink +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 NCCL_P2P_DISABLE=1 torchrun \ +--nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py --model_name_or_path openai-community/gpt2 \ +--dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 --do_train \ +--output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (NV2 in nvidia-smi topo -m) +Software: pytorch-1.8-to-be + cuda-11.0 / transformers==4.3.0.dev0 \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_5.txt b/chunked/nltk_chunking/_perf_hardware/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..69bd34dd753d15a15d636ce411c1b3ab317875e9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_5.txt @@ -0,0 +1,2 @@ +Power: +Some high-end consumer GPU cards have 2 and sometimes 3 PCI-E 8-Pin power sockets. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_6.txt b/chunked/nltk_chunking/_perf_hardware/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fde7063705f8c1ab8e1d5849ab385fdd6c91ef4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_6.txt @@ -0,0 +1 @@ +Make sure you have as many independent 12V PCI-E 8-Pin cables plugged into the card as there are sockets. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_7.txt b/chunked/nltk_chunking/_perf_hardware/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebfcbb9e236e3ed94308a101cb964ebae5a425ba --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_7.txt @@ -0,0 +1 @@ +Do not use the 2 splits at one end of the same cable (also known as pigtail cable). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_8.txt b/chunked/nltk_chunking/_perf_hardware/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e86189be0df4d93a3006204248fe9df38bb4740 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_8.txt @@ -0,0 +1 @@ +That is, if you have 2 sockets on the GPU, you want 2 PCI-E 8-Pin cables going from your PSU to the card and not one that has 2 PCI-E 8-Pin connectors at the end!
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_hardware/chunk_9.txt b/chunked/nltk_chunking/_perf_hardware/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b5d47de2d5ffc397f9c5d417c9179876c558e57 --- /dev/null +++ b/chunked/nltk_chunking/_perf_hardware/chunk_9.txt @@ -0,0 +1 @@ +You won't get the full performance out of your card otherwise. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_0.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..9fdee6c980e0dfd56f437b4324dc1acb4f97bed6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_0.txt @@ -0,0 +1,3 @@ + +CPU inference +With some optimizations, it is possible to efficiently run large model inference on a CPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_1.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..f07b4095f1820067418ab20594cdf5e7c5a1a8a9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_1.txt @@ -0,0 +1 @@ +One of these optimization techniques involves compiling the PyTorch code into an intermediate format for high-performance environments like C++. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_10.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..f57aee0e194c16caea59f59619d6fe0d193170a5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_10.txt @@ -0,0 +1 @@ +Before you start, make sure you have 🤗 Optimum installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_11.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..f931f52c134adf415ba3ee9085cdf9d3efa455f2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_11.txt @@ -0,0 +1,8 @@ +Enable BetterTransformer with the [PreTrainedModel.to_bettertransformer] method: + +from transformers import AutoModelForCausalLM +model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder") +model.to_bettertransformer() + +TorchScript +TorchScript is an intermediate PyTorch model representation that can be run in production environments where performance is important. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_12.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d4a8af0484aab567fea019b7b30ee9683bce317 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_12.txt @@ -0,0 +1 @@ +You can train a model in PyTorch and then export it to TorchScript to free the model from Python performance constraints. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_13.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..613c23079bc903e11b8f72a951a01ea97d179cb2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_13.txt @@ -0,0 +1 @@ +PyTorch traces a model to return a [ScriptFunction] that is optimized with just-in-time compilation (JIT). 
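As a rough sketch of what that tracing step can look like outside of [Trainer] (the SQuAD checkpoint mirrors the one used in the run_qa.py command later in this section; the saved file name is arbitrary, and torchscript=True makes the model return tuples so it can be traced):

```python
import torch
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

checkpoint = "csarron/bert-base-uncased-squad-v1"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint, torchscript=True)
model.eval()

inputs = tokenizer("Who wrote it?", "It was written by Jane.", return_tensors="pt")

# Trace the model with example inputs; the traced module can run without Python overhead.
traced = torch.jit.trace(model, (inputs["input_ids"], inputs["attention_mask"]))
torch.jit.save(traced, "bert_qa_traced.pt")

# The saved module can be reloaded and executed in a production environment.
loaded = torch.jit.load("bert_qa_traced.pt")
start_logits, end_logits = loaded(inputs["input_ids"], inputs["attention_mask"])
```

When working through [Trainer], the --jit_mode_eval flag shown below handles this tracing for you.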
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_14.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..181e85cad60948e99fed5b45c1a5f88609dbde35 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_14.txt @@ -0,0 +1 @@ +Compared to the default eager mode, JIT mode in PyTorch typically yields better performance for inference using optimization techniques like operator fusion. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_15.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c75dc2d568d56b56750608887e8d2e97e3349a1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_15.txt @@ -0,0 +1 @@ +For a gentle introduction to TorchScript, see the Introduction to PyTorch TorchScript tutorial. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_16.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1d393e9a6f4f35735dcbf195087cafcae4fa90b --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_16.txt @@ -0,0 +1,13 @@ +With the [Trainer] class, you can enable JIT mode for CPU inference by setting the --jit_mode_eval flag: + +python run_qa.py \ +--model_name_or_path csarron/bert-base-uncased-squad-v1 \ +--dataset_name squad \ +--do_eval \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/ \ +--no_cuda \ +--jit_mode_eval + +For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluation since the dict input is supported in jit.trace. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_17.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..407aa761d843abb72a60d29a8c803375f9e567cd --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_17.txt @@ -0,0 +1 @@ +For PyTorch < 1.14.0, JIT-mode could benefit a model if its forward parameter order matches the tuple input order in jit.trace, such as a question-answering model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_18.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a21ba199947e07a748359de2d45b97fa0ca2599 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_18.txt @@ -0,0 +1 @@ +If the forward parameter order does not match the tuple input order in jit.trace, like a text classification model, jit.trace will fail and we are capturing this with the exception here to make it fallback. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_19.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..6179a2f42ea391fae7c528b05eeceda977b249f5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_19.txt @@ -0,0 +1 @@ +Logging is used to notify users. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_2.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..0571940822ae3852b11a0dab99fc3432532bb1ba --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_2.txt @@ -0,0 +1 @@ +The other technique fuses multiple operations into one kernel to reduce the overhead of running each operation separately. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_20.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..4ed37fcf877e3032d343dcaf72f9cea7eb279794 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_20.txt @@ -0,0 +1,2 @@ +IPEX graph optimization +Intel® Extension for PyTorch (IPEX) provides further optimizations in JIT mode for Intel CPUs, and we recommend combining it with TorchScript for even faster performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_21.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..15289bef1c12fc55ae911e6421822648cd7573d1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_21.txt @@ -0,0 +1 @@ +The IPEX graph optimization fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_22.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..150a76b5cd93fde9194226a1417e8d0217bdd228 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_22.txt @@ -0,0 +1,18 @@ +To take advantage of these graph optimizations, make sure you have IPEX installed: + +pip install intel_extension_for_pytorch +Set the --use_ipex and --jit_mode_eval flags in the [Trainer] class to enable JIT mode with the graph optimizations: + +python run_qa.py \ +--model_name_or_path csarron/bert-base-uncased-squad-v1 \ +--dataset_name squad \ +--do_eval \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/ \ +--no_cuda \ +--use_ipex \ +--jit_mode_eval +🤗 Optimum + +Learn more details about using ORT with 🤗 Optimum in the Optimum Inference with ONNX Runtime guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_23.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef51a45648c567815c964174efe4feb0fd0b8e4b --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_23.txt @@ -0,0 +1 @@ +This section only provides a brief and simple example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_24.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..bc9cb1fe3532bf80766dd67a3f260788bb8eaac0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_24.txt @@ -0,0 +1 @@ +ONNX Runtime (ORT) is a model accelerator that runs inference on CPUs by default. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_25.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..abad55440e7cee8a01a8309ec3ec848662b1efa2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_25.txt @@ -0,0 +1 @@ +ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers, without making too many changes to your code. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_26.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d77ff4bd614b86ac197c5a4ba3a83a16d2f1e9ab --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_26.txt @@ -0,0 +1 @@ +You only need to replace the 🤗 Transformers AutoClass with its equivalent [~optimum.onnxruntime.ORTModel] for the task you're solving, and load a checkpoint in the ONNX format. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_27.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ae8663174331b89f7f76160283451427c6c44731 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_27.txt @@ -0,0 +1,8 @@ +For example, if you're running inference on a question answering task, load the optimum/roberta-base-squad2 checkpoint which contains a model.onnx file: + +from transformers import AutoTokenizer, pipeline +from optimum.onnxruntime import ORTModelForQuestionAnswering +model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2") +tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2") +onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer) +question = "What's my name?" \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_28.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1207aecea370fa66cca85bccd32e6bfabca65b4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_28.txt @@ -0,0 +1 @@ +context = "My name is Philipp and I live in Nuremberg." \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_29.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d1d3fa8f602f6dff10362da832e4b341cc07d89 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_29.txt @@ -0,0 +1,3 @@ +pred = onnx_qa(question, context) + +If you have an Intel CPU, take a look at 🤗 Optimum Intel which supports a variety of compression techniques (quantization, pruning, knowledge distillation) and tools for converting models to the OpenVINO format for higher performance inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_3.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..186a2899c4b6fec69e73c18146630daa38288b47 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_3.txt @@ -0,0 +1 @@ +You'll learn how to use BetterTransformer for faster inference, and how to convert your PyTorch code to TorchScript. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_4.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ece112657011a8e74f3c90b7d620746839f9ff4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_4.txt @@ -0,0 +1 @@ +If you're using an Intel CPU, you can also use graph optimizations from Intel Extension for PyTorch to boost inference speed even more. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_5.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..73a31aadf618e0ca1dec31591229a972a8368d8c --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_5.txt @@ -0,0 +1 @@ +Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime or OpenVINO (if you're using an Intel CPU). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_6.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..c53601b218c4541ff4cf6eededa41bbf428409df --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_6.txt @@ -0,0 +1,2 @@ +BetterTransformer +BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_7.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..1787fa4a17a858ee18aad88cd553e8eff206dab9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_7.txt @@ -0,0 +1,6 @@ +The two optimizations in the fastpath execution are: + +fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps +skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors + +BetterTransformer also converts all attention operations to use the more memory-efficient scaled dot product attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_8.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..a74012f5ba39a6acf6924a08690d1e6a63489fb0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_8.txt @@ -0,0 +1 @@ +BetterTransformer is not supported for all models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_cpu/chunk_9.txt b/chunked/nltk_chunking/_perf_infer_cpu/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..288504867ef9406968464bfc214b13da99a5c403 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_cpu/chunk_9.txt @@ -0,0 +1 @@ +Check this list to see if a model supports BetterTransformer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_0.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..21460a8a08f7be669237c5714104d078cd6498e3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_0.txt @@ -0,0 +1,3 @@ + +GPU inference +GPUs are the standard choice of hardware for machine learning, unlike CPUs, because they are optimized for memory bandwidth and parallelism. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_1.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..88bd3329b6b0a6a6c593ed44271029edb00d0a4e --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_1.txt @@ -0,0 +1 @@ +To keep up with the larger sizes of modern models or to run these large models on existing and older hardware, there are several optimizations you can use to speed up GPU inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_10.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..c8d0b12c85163ebed86aa2fc90a4c4b60aeb3869 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_10.txt @@ -0,0 +1 @@ +We strongly suggest using this Dockerfile to use FlashAttention-2 on AMD GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_11.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..722dba7c4f849b6c17929bd238cf654b33d5fca1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_11.txt @@ -0,0 +1,12 @@ +To enable FlashAttention-2, pass the argument attn_implementation="flash_attention_2" to [~AutoModelForCausalLM.from_pretrained]: +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) +model = AutoModelForCausalLM.from_pretrained( + model_id, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", +) + +FlashAttention-2 can only be used when the model's dtype is fp16 or bf16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_12.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6fe7665a32920ed5d158622a9fff35e79383ba8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_12.txt @@ -0,0 +1 @@ +Make sure to cast your model to the appropriate dtype and load it on a supported device before using FlashAttention-2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_13.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..83c9fe1df1a57f448d69b61750f503dd3fad2835 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_13.txt @@ -0,0 +1 @@ +You can also set use_flash_attention_2=True to enable FlashAttention-2, but it is deprecated in favor of attn_implementation="flash_attention_2". \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_14.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..7493076ac749557762aac411cf35fc506caec54a --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_14.txt @@ -0,0 +1 @@ +FlashAttention-2 can be combined with other optimization techniques like quantization to further speed up inference.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_15.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d1a1f4d0740fb4a18582d316323c13739b621ea --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_15.txt @@ -0,0 +1,21 @@ +For example, you can combine FlashAttention-2 with 8-bit or 4-bit quantization: + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM +model_id = "tiiuae/falcon-7b" +tokenizer = AutoTokenizer.from_pretrained(model_id) +load in 8bit +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_8bit=True, + attn_implementation="flash_attention_2", +) +load in 4bit +model = AutoModelForCausalLM.from_pretrained( + model_id, + load_in_4bit=True, + attn_implementation="flash_attention_2", +) + +Expected speedups +You can benefit from considerable speedups for inference, especially for inputs with long sequences. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_16.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..379b1ca2e371c3ad01d3271c699af758f116aab0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_16.txt @@ -0,0 +1 @@ +However, since FlashAttention-2 does not support computing attention scores with padding tokens, you must manually pad/unpad the attention scores for batched inference when the sequence contains padding tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_17.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..dec908491a3814a254a9426df241e7fa4998f67b --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_17.txt @@ -0,0 +1 @@ +This leads to a significant slowdown for batched generations with padding tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_18.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..c07654dca9215dacb626fd30086e66ac925fb589 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_18.txt @@ -0,0 +1 @@ +To overcome this, you should use FlashAttention-2 without padding tokens in the sequence during training (by packing a dataset or concatenating sequences until reaching the maximum sequence length). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_19.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d84c9f502c217aab46e497258c5fd84988528c49 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_19.txt @@ -0,0 +1,5 @@ +For a single forward pass on tiiuae/falcon-7b with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is: + +For a single forward pass on meta-llama/Llama-7b-hf with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is: + +For sequences with padding tokens (generating with padding tokens), you need to unpad/pad the input sequences to correctly compute the attention scores. 
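To make the "pack or concatenate sequences instead of padding them" idea above concrete, here is an illustrative sketch only; the texts and block size are placeholders, and real training pipelines usually rely on a packing-aware data collator rather than a manual loop like this:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b")

texts = ["first training document ...", "second training document ...", "third ..."]  # placeholders
block_size = 512  # placeholder; in practice use the model's maximum sequence length

# Tokenize without padding and concatenate everything into one token stream,
# separating documents with the EOS token.
ids = []
for text in texts:
    ids.extend(tokenizer(text, add_special_tokens=False)["input_ids"])
    ids.append(tokenizer.eos_token_id)

# Split the stream into fixed-size blocks: every block is completely full,
# so no padding tokens are needed and FlashAttention-2's fast path applies.
blocks = [ids[i : i + block_size] for i in range(0, len(ids) - block_size + 1, block_size)]
```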
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_2.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..38038732707830c1d6976405d24692a76c5f83f3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_2.txt @@ -0,0 +1 @@ +In this guide, you'll learn how to use FlashAttention-2 (a more memory-efficient attention mechanism), BetterTransformer (a PyTorch native fastpath execution), and bitsandbytes to quantize your model to a lower precision. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_20.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..72e1e88e762077bf11961b68ba9c25cc6cd58963 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_20.txt @@ -0,0 +1,5 @@ +With a relatively small sequence length, a single forward pass creates overhead leading to a small speedup (in the example below, 30% of the input is filled with padding tokens): + +But for larger sequence lengths, you can expect even more speedup benefits: + +FlashAttention is more memory efficient, meaning you can train on much larger sequence lengths without running into out-of-memory issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_21.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..f46739914a00dc67fa619730fde3c64bf5e852f4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_21.txt @@ -0,0 +1 @@ +You can potentially reduce memory usage up to 20x for larger sequence lengths. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_22.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..69d7cd7d4dce3b4d2200684ae72d08326395b734 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_22.txt @@ -0,0 +1 @@ +Take a look at the flash-attention repository for more details. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_23.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..45f1728934411146209083e942c6ca496acc0f39 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_23.txt @@ -0,0 +1,2 @@ +PyTorch scaled dot product attention +PyTorch's torch.nn.functional.scaled_dot_product_attention (SDPA) can also call FlashAttention and memory-efficient attention kernels under the hood. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_24.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d8ff91455979264f9724bfded6d7e8e4316bb1d --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_24.txt @@ -0,0 +1 @@ +SDPA support is currently being added natively in Transformers and is used by default for torch>=2.1.1 when an implementation is available. 
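Editor's note: chunk_24 above notes that SDPA is picked up automatically for torch>=2.1.1. For readers who want to opt in explicitly, recent Transformers releases also accept `attn_implementation="sdpa"`; the minimal sketch below reuses the small OPT checkpoint that appears in the nearby chunks.

```python
# Sketch: explicitly request the PyTorch SDPA attention implementation
# (assumes a recent transformers release and torch >= 2.1.1).
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "facebook/opt-350m",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
).to("cuda")
```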
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_25.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..b50d66fdc28c28cee70df0bc50736097cfc96818 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_25.txt @@ -0,0 +1,13 @@ +For now, Transformers supports SDPA inference and training for the following architectures: +* Bart +* GPTBigCode +* Falcon +* Llama +* Phi +* Idefics +* Whisper +* Mistral +* Mixtral +* Qwen2 + +FlashAttention can only be used for models with the fp16 or bf16 torch type, so make sure to cast your model to the appropriate type first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_26.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..65b351ae6426bef28ef98b9e0c8d9895fbc17386 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_26.txt @@ -0,0 +1,19 @@ +By default, SDPA selects the most performant kernel available but you can check whether a backend is available in a given setting (hardware, problem size) with torch.backends.cuda.sdp_kernel as a context manager: + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16).to("cuda") +convert the model to BetterTransformer +model.to_bettertransformer() +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") + +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) + +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) + +If you see a bug with the traceback below, try using the nightly version of PyTorch which may have broader coverage for FlashAttention: +```bash +RuntimeError: No available kernel. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_27.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecd4b6693234ae16cbcbc52d16027607326ccb84 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_27.txt @@ -0,0 +1 @@ +Aborting execution. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_28.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..406cc6ad9d7db7509dc763d6a423112cd817b221 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_28.txt @@ -0,0 +1,6 @@ +install PyTorch nightly +pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118 + +BetterTransformer + +Some BetterTransformer features are being upstreamed to Transformers with default support for native torch.nn.scaled_dot_product_attention. 
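Editor's note: for readers unfamiliar with the kernel these chunks refer to, the sketch below calls `torch.nn.functional.scaled_dot_product_attention` directly on random tensors, inside the same `sdp_kernel` context manager used in chunk_26; the tensor shapes are arbitrary and only serve to illustrate the API.

```python
# Sketch: the SDPA kernel the text refers to, called directly (torch >= 2.0).
import torch
import torch.nn.functional as F

# (batch, heads, seq_len, head_dim) -- arbitrary sizes for illustration
q = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
k = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)
v = torch.randn(1, 8, 128, 64, device="cuda", dtype=torch.float16)

# Restrict SDPA to the FlashAttention backend, as in the chunk above
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)

print(out.shape)  # torch.Size([1, 8, 128, 64])
```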
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_29.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..397bc99550c221498db8cf60bc741c9a3d9c9d81 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_29.txt @@ -0,0 +1 @@ +BetterTransformer still has a wider coverage than the Transformers SDPA integration, but you can expect more and more architectures to natively support SDPA in Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_3.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..467f46d86c233421ee007c4f6bb6b6d79b3af892 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_3.txt @@ -0,0 +1 @@ +Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime on Nvidia and AMD GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_30.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..15e1309d37f150f57728866b9bb9d9f7d8061bdc --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_30.txt @@ -0,0 +1 @@ +Check out our benchmarks with BetterTransformer and scaled dot product attention in the Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0 and learn more about the fastpath execution in the BetterTransformer blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_31.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c63cee4146c82d6aa97537876587db53cd5a01d2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_31.txt @@ -0,0 +1 @@ +BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_32.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cac088de642942db63fb842d74ddcee863fe7d5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_32.txt @@ -0,0 +1,6 @@ +The two optimizations in the fastpath execution are: + +fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps +skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors + +BetterTransformer also converts all attention operations to use the more memory-efficient scaled dot product attention (SDPA), and it calls optimized kernels like FlashAttention under the hood. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_33.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f57aee0e194c16caea59f59619d6fe0d193170a5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_33.txt @@ -0,0 +1 @@ +Before you start, make sure you have 🤗 Optimum installed. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_34.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..939b22bfed76180fc4d211fd48af64d8c45bab51 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_34.txt @@ -0,0 +1,4 @@ +Then you can enable BetterTransformer with the [PreTrainedModel.to_bettertransformer] method: +python +model = model.to_bettertransformer() +You can return the original Transformers model with the [~PreTrainedModel.reverse_bettertransformer] method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_35.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..92a6b7fe880c56a7c3a371e2b43f5fe0e4f1e736 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_35.txt @@ -0,0 +1,6 @@ +You should use this before saving your model to use the canonical Transformers modeling: +py +model = model.reverse_bettertransformer() +model.save_pretrained("saved_model") +bitsandbytes +bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_36.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e23723b0159f5f67746b8e9f54dbac6c83eec1d --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_36.txt @@ -0,0 +1 @@ +Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_37.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7d8d2b4b9c890c5e05ef23377da4bfc44597cc7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_37.txt @@ -0,0 +1,9 @@ +Make sure you have bitsandbytes and 🤗 Accelerate installed: +```bash +these versions support 8-bit and 4-bit +pip install bitsandbytes>=0.39.0 accelerate>=0.20.0 +install Transformers +pip install transformers + +4-bit +To load a model in 4-bit for inference, use the load_in_4bit parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_38.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..d79a21d8257d5b431a75593030fb6e0ced628642 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_38.txt @@ -0,0 +1 @@ +The device_map parameter is optional, but we recommend setting it to "auto" to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment. 
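Editor's note: chunk_37 and chunk_38 above use the `load_in_4bit` shortcut. As a complement, here is a hedged sketch of the equivalent `BitsAndBytesConfig` route that recent Transformers versions also accept; the NF4 quant type and bf16 compute dtype are common choices, not requirements stated in the chunked docs.

```python
# Sketch: 4-bit loading via BitsAndBytesConfig instead of the load_in_4bit shortcut.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # assumption: NF4 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # assumption: bf16 compute dtype
)
model_4bit = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-2b5",                 # checkpoint reused from the nearby chunks
    device_map="auto",
    quantization_config=quantization_config,
)
```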
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_39.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecdbf2987409e0473f4dfe736543ee417e339f0e --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_39.txt @@ -0,0 +1,5 @@ +from transformers import AutoModelForCausalLM +model_name = "bigscience/bloom-2b5" +model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True) + +To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_4.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..80d2732356e72b2eb15c337958b86829f84aca11 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_4.txt @@ -0,0 +1 @@ +The majority of the optimizations described here also apply to multi-GPU setups! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_40.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..209e75a16c4a7544b31e9f00a12c6492756e4d43 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_40.txt @@ -0,0 +1,10 @@ +For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU: +py +max_memory_mapping = {0: "600MB", 1: "1GB"} +model_name = "bigscience/bloom-3b" +model_4bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping +) +8-bit + +If you're curious and interested in learning more about the concepts underlying 8-bit quantization, read the Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_41.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..24f72b8ad3d0265dc4953a6ec13d72b64d5af139 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_41.txt @@ -0,0 +1 @@ +To load a model in 8-bit for inference, use the load_in_8bit parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_42.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..99b6c2a7858193160f96e3ac7b7015f736f44a48 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_42.txt @@ -0,0 +1,7 @@ +The device_map parameter is optional, but we recommend setting it to "auto" to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment: + +from transformers import AutoModelForCausalLM +model_name = "bigscience/bloom-2b5" +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) + +If you're loading a model in 8-bit for text generation, you should use the [~transformers.GenerationMixin.generate] method instead of the [Pipeline] function which is not optimized for 8-bit models and will be slower. 
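Editor's note: as a quick sanity check on the memory savings these quantization chunks describe, `PreTrainedModel` exposes `get_memory_footprint()`. The comparison below is a sketch; exact numbers depend on the checkpoint and environment.

```python
# Sketch: compare the memory footprint of full-precision vs. 8-bit loading.
from transformers import AutoModelForCausalLM

model_name = "bigscience/bloom-2b5"
model_fp32 = AutoModelForCausalLM.from_pretrained(model_name)
model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)

print(model_fp32.get_memory_footprint())  # bytes, full precision
print(model_8bit.get_memory_footprint())  # bytes, noticeably smaller
```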
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_43.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..a211224317f81fb0a043bedbe2c3eb7b12e12a0c --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_43.txt @@ -0,0 +1 @@ +Some sampling strategies, like nucleus sampling, are also not supported by the [Pipeline] for 8-bit models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_44.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..de684a8821f12e2a9eb4637d498f53aaf0791687 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_44.txt @@ -0,0 +1,12 @@ +You should also place all inputs on the same device as the model: + +from transformers import AutoModelForCausalLM, AutoTokenizer +model_name = "bigscience/bloom-2b5" +tokenizer = AutoTokenizer.from_pretrained(model_name) +model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True) +prompt = "Hello, my llama is cute" +inputs = tokenizer(prompt, return_tensors="pt").to("cuda") +generated_ids = model.generate(**inputs) +outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) + +To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_45.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc78505b8aaa6227b9c4106ed94737805836d463 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_45.txt @@ -0,0 +1,9 @@ +For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU: +py +max_memory_mapping = {0: "1GB", 1: "2GB"} +model_name = "bigscience/bloom-3b" +model_8bit = AutoModelForCausalLM.from_pretrained( + model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping +) + +Feel free to try running a 11 billion parameter T5 model or the 3 billion parameter BLOOM model for inference on Google Colab's free tier GPUs! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_46.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..5fc4cb95e11357167973d8d531f4b9f589294a18 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_46.txt @@ -0,0 +1,3 @@ +🤗 Optimum + +Learn more details about using ORT with 🤗 Optimum in the Accelerated inference on NVIDIA GPUs and Accelerated inference on AMD GPUs guides. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_47.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef51a45648c567815c964174efe4feb0fd0b8e4b --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_47.txt @@ -0,0 +1 @@ +This section only provides a brief and simple example. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_48.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..914ab93575f3dba8c8e64a0ff8c6ebbc918ce8c1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_48.txt @@ -0,0 +1 @@ +ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs, and AMD GPUs that use ROCm stack. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_49.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..881df7fa74c06ce7658010e3170e87d6ee936d37 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_49.txt @@ -0,0 +1 @@ +ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_5.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1ddd23aca269c6d4bab5974a00e0fd03d043952 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_5.txt @@ -0,0 +1,3 @@ +FlashAttention-2 + +FlashAttention-2 is experimental and may change considerably in future versions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_50.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6a3b3b502a51235035fb7e65df1da9a5aa0f6a3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_50.txt @@ -0,0 +1 @@ +ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_51.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..f67458d851172d7b2df58ee091b02737a63a8691 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_51.txt @@ -0,0 +1 @@ +ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_52.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..668be4ac8c0e63f11c28dfc0e6c072c3d387f780 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_52.txt @@ -0,0 +1 @@ +You'll need to use an [~optimum.onnxruntime.ORTModel] for the task you're solving, and specify the provider parameter which can be set to either CUDAExecutionProvider, ROCMExecutionProvider or TensorrtExecutionProvider. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_53.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..93e8aee7cfe906fbaa0afb93ea807ba3f6574ef7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_53.txt @@ -0,0 +1,16 @@ +If you want to load a model that was not yet exported to ONNX, you can set export=True to convert your model on-the-fly to the ONNX format: + +from optimum.onnxruntime import ORTModelForSequenceClassification +ort_model = ORTModelForSequenceClassification.from_pretrained( + "distilbert/distilbert-base-uncased-finetuned-sst-2-english", + export=True, + provider="CUDAExecutionProvider", +) + +Now you're free to use the model for inference: + +from optimum.pipelines import pipeline +from transformers import AutoTokenizer +tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased-finetuned-sst-2-english") +pipeline = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0") +result = pipeline("Both the music and visual were astounding, not to mention the actors performance.") \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_54.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c8bc1e2006f89cb8c6422935273a2ef1006f858 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_54.txt @@ -0,0 +1,2 @@ +Combine optimizations +It is often possible to combine several of the optimization techniques described above to get the best inference performance possible for your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_55.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..f739613aaf689b704f1e9ddc7ee915dc77cb303f --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_55.txt @@ -0,0 +1,20 @@ +For example, you can load a model in 4-bit, and then enable BetterTransformer with FlashAttention: + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +load model in 4-bit +quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16 +) +tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m") +model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config) +enable BetterTransformer +model = model.to_bettertransformer() +input_text = "Hello my dog is cute and" +inputs = tokenizer(input_text, return_tensors="pt").to("cuda") +enable FlashAttention +with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False): + outputs = model.generate(**inputs) +print(tokenizer.decode(outputs[0], skip_special_tokens=True)) +``` \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_6.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fa69678cedeacfc6ece833c7a326fe2951738a7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_6.txt @@ -0,0 +1,25 @@ +FlashAttention-2 is a faster and more efficient implementation of the standard attention mechanism that can significantly speedup inference by: + +additionally parallelizing the attention computation over sequence length 
+partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them + +FlashAttention-2 is currently supported for the following architectures: +* Bark +* Bart +* DistilBert +* GPTBigCode +* GPTNeo +* GPTNeoX +* Falcon +* Llama +* Llava +* VipLlava +* MBart +* Mistral +* Mixtral +* OPT +* Phi +* StableLm +* Qwen2 +* Whisper +You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_7.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..11dab5ad6e0943da9beef819835d0a096d98c37e --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_7.txt @@ -0,0 +1 @@ +Before you begin, make sure you have FlashAttention-2 installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_8.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..116eb618e8ad23807507609b4825aa42a2c20ec5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_8.txt @@ -0,0 +1,2 @@ +pip install flash-attn --no-build-isolation +We strongly suggest referring to the detailed installation instructions to learn more about supported hardware and data types! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_9.txt b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..d33abe9c11bd698f83472b7e78e6be453e863dc2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_infer_gpu_one/chunk_9.txt @@ -0,0 +1 @@ +FlashAttention-2 is also supported on AMD GPUs and current support is limited to Instinct MI210 and Instinct MI250. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_0.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..b1f1d5fb96b0f8a95ab97786c83f3dca41c73d0b --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_0.txt @@ -0,0 +1,3 @@ + +Optimize inference using torch.compile() +This guide aims to provide a benchmark on the inference speed-ups introduced with torch.compile() for computer vision models in 🤗 Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_1.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..c70e57302190b4e88216c5a04a0f9f639c918cc8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_1.txt @@ -0,0 +1,2 @@ +Benefits of torch.compile +Depending on the model and the GPU, torch.compile() yields up to 30% speed-up during inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_10.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb215719551bb3fe33b9ceb3a20b72827d5e12fb --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_10.txt @@ -0,0 +1,2 @@ +Benchmarking code +Below you can find the benchmarking code for each task. 
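Editor's note: the adjacent torch.compile benchmarking chunks describe warming up the GPU and then averaging 300 timed inferences on the same input. The harness below is a hypothetical reconstruction of that methodology (the actual benchmarking script is not included in these chunks), shown so the reported millisecond figures are easier to interpret.

```python
# Hypothetical timing harness matching the described methodology:
# warm-up runs first, then the mean latency over 300 inferences on the same input.
import time
import torch

def benchmark(model, inputs, warmup=10, runs=300):
    with torch.no_grad():
        for _ in range(warmup):
            _ = model(**inputs)
        torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(runs):
            _ = model(**inputs)
        torch.cuda.synchronize()
    return (time.perf_counter() - start) / runs * 1000  # mean latency in milliseconds
```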
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_11.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f1adfe2438f28e531ccdccc707ec13fb8e12343 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_11.txt @@ -0,0 +1 @@ +We warm up the GPU before inference and take the mean time of 300 inferences, using the same image each time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_12.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a3e9ab1b05e640e921f865fa36e65ef2f46ce45 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_12.txt @@ -0,0 +1,38 @@ +Image Classification with ViT +thon +import torch +from PIL import Image +import requests +import numpy as np +from transformers import AutoImageProcessor, AutoModelForImageClassification +url = 'http://images.cocodataset.org/val2017/000000039769.jpg' +image = Image.open(requests.get(url, stream=True).raw) +processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224") +model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda") +model = torch.compile(model) +processed_input = processor(image, return_tensors='pt').to(device="cuda") +with torch.no_grad(): + _ = model(**processed_input) + +Object Detection with DETR +thon +from transformers import AutoImageProcessor, AutoModelForObjectDetection +processor = AutoImageProcessor.from_pretrained("facebook/detr-resnet-50") +model = AutoModelForObjectDetection.from_pretrained("facebook/detr-resnet-50").to("cuda") +model = torch.compile(model) +texts = ["a photo of a cat", "a photo of a dog"] +inputs = processor(text=texts, images=image, return_tensors="pt").to("cuda") +with torch.no_grad(): + _ = model(**inputs) + +Image Segmentation with Segformer +thon +from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation +processor = SegformerImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512") +model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512").to("cuda") +model = torch.compile(model) +seg_inputs = processor(images=image, return_tensors="pt").to("cuda") +with torch.no_grad(): + _ = model(**seg_inputs) + +Below you can find the list of the models we benchmarked. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_13.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e949f85ff5e176f34b563cd94860bec9ab27319 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_13.txt @@ -0,0 +1,15 @@ +Image Classification +- google/vit-base-patch16-224 +- microsoft/beit-base-patch16-224-pt22k-ft22k +- facebook/convnext-large-224 +- microsoft/resnet-50 +Image Segmentation +- nvidia/segformer-b0-finetuned-ade-512-512 +- facebook/mask2former-swin-tiny-coco-panoptic +- facebook/maskformer-swin-base-ade +- google/deeplabv3_mobilenet_v2_1.0_513 +Object Detection +- google/owlvit-base-patch32 +- facebook/detr-resnet-101 +- microsoft/conditional-detr-resnet-50 +Below you can find visualization of inference durations with and without torch.compile() and percentage improvements for each model in different hardware and batch sizes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_14.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..dbb4429b3eb45a439fb7f52e72f928ea6ce47c44 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_14.txt @@ -0,0 +1 @@ +Below you can find inference durations in milliseconds for each model with and without compile(). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_15.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..a8d07bebf2f5320fcf71b5f4003cbb4599afc629 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_15.txt @@ -0,0 +1 @@ +Note that OwlViT results in OOM in larger batch sizes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_16.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..5c14c7e64d9c883a7f9929f64d7b42ab12edaee3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_16.txt @@ -0,0 +1,131 @@ +A100 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 9.325 | 7.584 | +| Image Segmentation/Segformer | 11.759 | 10.500 | +| Object Detection/OwlViT | 24.978 | 18.420 | +| Image Classification/BeiT | 11.282 | 8.448 | +| Object Detection/DETR | 34.619 | 19.040 | +| Image Classification/ConvNeXT | 10.410 | 10.208 | +| Image Classification/ResNet | 6.531 | 4.124 | +| Image Segmentation/Mask2former | 60.188 | 49.117 | +| Image Segmentation/Maskformer | 75.764 | 59.487 | +| Image Segmentation/MobileNet | 8.583 | 3.974 | +| Object Detection/Resnet-101 | 36.276 | 18.197 | +| Object Detection/Conditional-DETR | 31.219 | 17.993 | +A100 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 14.832 | 14.499 | +| Image Segmentation/Segformer | 18.838 | 16.476 | +| Image Classification/BeiT | 13.205 | 13.048 | +| Object Detection/DETR | 48.657 | 32.418| +| Image Classification/ConvNeXT | 22.940 | 21.631 | +| Image Classification/ResNet | 6.657 | 4.268 | +| Image Segmentation/Mask2former | 74.277 | 61.781 | +| Image Segmentation/Maskformer | 180.700 | 159.116 | +| Image Segmentation/MobileNet | 14.174 | 8.515 | +| Object Detection/Resnet-101 | 68.101 | 44.998 | +| Object Detection/Conditional-DETR | 56.470 | 35.552 | +A100 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 40.944 | 40.010 | +| Image Segmentation/Segformer | 37.005 | 31.144 | +| Image Classification/BeiT | 41.854 | 41.048 | +| Object Detection/DETR | 164.382 | 161.902 | +| Image Classification/ConvNeXT | 82.258 | 75.561 | +| Image Classification/ResNet | 7.018 | 5.024 | +| Image Segmentation/Mask2former | 178.945 | 154.814 | +| Image Segmentation/Maskformer | 638.570 | 579.826 | +| Image Segmentation/MobileNet | 51.693 | 30.310 | +| Object Detection/Resnet-101 | 232.887 | 155.021 | +| Object Detection/Conditional-DETR | 180.491 | 124.032 | +V100 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 10.495 | 6.00 | +| Image Segmentation/Segformer | 13.321 | 5.862 | +| Object Detection/OwlViT | 25.769 | 22.395 | +| Image Classification/BeiT 
| 11.347 | 7.234 | +| Object Detection/DETR | 33.951 | 19.388 | +| Image Classification/ConvNeXT | 11.623 | 10.412 | +| Image Classification/ResNet | 6.484 | 3.820 | +| Image Segmentation/Mask2former | 64.640 | 49.873 | +| Image Segmentation/Maskformer | 95.532 | 72.207 | +| Image Segmentation/MobileNet | 9.217 | 4.753 | +| Object Detection/Resnet-101 | 52.818 | 28.367 | +| Object Detection/Conditional-DETR | 39.512 | 20.816 | +V100 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 15.181 | 14.501 | +| Image Segmentation/Segformer | 16.787 | 16.188 | +| Image Classification/BeiT | 15.171 | 14.753 | +| Object Detection/DETR | 88.529 | 64.195 | +| Image Classification/ConvNeXT | 29.574 | 27.085 | +| Image Classification/ResNet | 6.109 | 4.731 | +| Image Segmentation/Mask2former | 90.402 | 76.926 | +| Image Segmentation/Maskformer | 234.261 | 205.456 | +| Image Segmentation/MobileNet | 24.623 | 14.816 | +| Object Detection/Resnet-101 | 134.672 | 101.304 | +| Object Detection/Conditional-DETR | 97.464 | 69.739 | +V100 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 52.209 | 51.633 | +| Image Segmentation/Segformer | 61.013 | 55.499 | +| Image Classification/BeiT | 53.938 | 53.581 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 109.682 | 100.771 | +| Image Classification/ResNet | 14.857 | 12.089 | +| Image Segmentation/Mask2former | 249.605 | 222.801 | +| Image Segmentation/Maskformer | 831.142 | 743.645 | +| Image Segmentation/MobileNet | 93.129 | 55.365 | +| Object Detection/Resnet-101 | 482.425 | 361.843 | +| Object Detection/Conditional-DETR | 344.661 | 255.298 | +T4 (batch size: 1) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 16.520 | 15.786 | +| Image Segmentation/Segformer | 16.116 | 14.205 | +| Object Detection/OwlViT | 53.634 | 51.105 | +| Image Classification/BeiT | 16.464 | 15.710 | +| Object Detection/DETR | 73.100 | 53.99 | +| Image Classification/ConvNeXT | 32.932 | 30.845 | +| Image Classification/ResNet | 6.031 | 4.321 | +| Image Segmentation/Mask2former | 79.192 | 66.815 | +| Image Segmentation/Maskformer | 200.026 | 188.268 | +| Image Segmentation/MobileNet | 18.908 | 11.997 | +| Object Detection/Resnet-101 | 106.622 | 82.566 | +| Object Detection/Conditional-DETR | 77.594 | 56.984 | +T4 (batch size: 4) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 43.653 | 43.626 | +| Image Segmentation/Segformer | 45.327 | 42.445 | +| Image Classification/BeiT | 52.007 | 51.354 | +| Object Detection/DETR | 277.850 | 268.003 | +| Image Classification/ConvNeXT | 119.259 | 105.580 | +| Image Classification/ResNet | 13.039 | 11.388 | +| Image Segmentation/Mask2former | 201.540 | 184.670 | +| Image Segmentation/Maskformer | 764.052 | 711.280 | +| Image Segmentation/MobileNet | 74.289 | 48.677 | +| Object Detection/Resnet-101 | 421.859 | 357.614 | +| Object Detection/Conditional-DETR | 289.002 | 226.945 | +T4 (batch size: 16) +| Task/Model | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:| +| Image Classification/ViT | 163.914 | 160.907 | +| Image Segmentation/Segformer | 192.412 | 163.620 | +| Image Classification/BeiT | 188.978 | 187.976 | +| Object Detection/DETR | OOM | OOM | +| Image Classification/ConvNeXT | 422.886 | 388.078 | +| Image 
Classification/ResNet | 44.114 | 37.604 | +| Image Segmentation/Mask2former | 756.337 | 695.291 | +| Image Segmentation/Maskformer | 2842.940 | 2656.88 | +| Image Segmentation/MobileNet | 299.003 | 201.942 | +| Object Detection/Resnet-101 | 1619.505 | 1262.758 | +| Object Detection/Conditional-DETR | 1137.513 | 897.390| +PyTorch Nightly +We also benchmarked on PyTorch nightly (2.1.0dev, find the wheel here) and observed improvement in latency both for uncompiled and compiled models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_17.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..be524d2373f840b06772e2639399729546a7c800 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_17.txt @@ -0,0 +1,29 @@ +A100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 12.462 | 6.954 | +| Image Classification/BeiT | 4 | 14.109 | 12.851 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 30.484 | 15.221 | +| Object Detection/DETR | 4 | 46.816 | 30.942 | +| Object Detection/DETR | 16 | 163.749 | 163.706 | +T4 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 14.408 | 14.052 | +| Image Classification/BeiT | 4 | 47.381 | 46.604 | +| Image Classification/BeiT | 16 | 42.179 | 42.147 | +| Object Detection/DETR | Unbatched | 68.382 | 53.481 | +| Object Detection/DETR | 4 | 269.615 | 204.785 | +| Object Detection/DETR | 16 | OOM | OOM | +V100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/BeiT | Unbatched | 13.477 | 7.926 | +| Image Classification/BeiT | 4 | 15.103 | 14.378 | +| Image Classification/BeiT | 16 | 52.517 | 51.691 | +| Object Detection/DETR | Unbatched | 28.706 | 19.077 | +| Object Detection/DETR | 4 | 88.402 | 62.949| +| Object Detection/DETR | 16 | OOM | OOM | +Reduce Overhead +We benchmarked reduce-overhead compilation mode for A100 and T4 in Nightly. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_18.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ed1d82c694cfa3765b2ca7434d09bb1e8d8a3f --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_18.txt @@ -0,0 +1,22 @@ +A100 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 11.758 | 7.335 | +| Image Classification/ConvNeXT | 4 | 23.171 | 21.490 | +| Image Classification/ResNet | Unbatched | 7.435 | 3.801 | +| Image Classification/ResNet | 4 | 7.261 | 2.187 | +| Object Detection/Conditional-DETR | Unbatched | 32.823 | 11.627 | +| Object Detection/Conditional-DETR | 4 | 50.622 | 33.831 | +| Image Segmentation/MobileNet | Unbatched | 9.869 | 4.244 | +| Image Segmentation/MobileNet | 4 | 14.385 | 7.946 | +T4 +| Task/Model | Batch Size | torch 2.0 - no compile | torch 2.0 - compile | +|:---:|:---:|:---:|:---:| +| Image Classification/ConvNeXT | Unbatched | 32.137 | 31.84 | +| Image Classification/ConvNeXT | 4 | 120.944 | 110.209 | +| Image Classification/ResNet | Unbatched | 9.761 | 7.698 | +| Image Classification/ResNet | 4 | 15.215 | 13.871 | +| Object Detection/Conditional-DETR | Unbatched | 72.150 | 57.660 | +| Object Detection/Conditional-DETR | 4 | 301.494 | 247.543 | +| Image Segmentation/MobileNet | Unbatched | 22.266 | 19.339 | +| Image Segmentation/MobileNet | 4 | 78.311 | 50.983 | \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_2.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c21c526a4d641dba98af523641b6b23e812cff6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_2.txt @@ -0,0 +1 @@ +To use torch.compile(), simply install any version of torch above 2.0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_3.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8798f704d7cdbe869cfc4a61f2872385d61f022 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_3.txt @@ -0,0 +1 @@ +Compiling a model takes time, so it's useful if you are compiling the model only once instead of every time you infer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_4.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..c6761dc4df4dd7e1de6c664fbf9785376c173c3f --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_4.txt @@ -0,0 +1,7 @@ +To compile any computer vision model of your choice, call torch.compile() on the model as shown below: + +from transformers import AutoModelForImageClassification +model = AutoModelForImageClassification.from_pretrained(MODEL_ID).to("cuda") ++ model = torch.compile(model) + +compile() comes with multiple modes for compiling, which essentially differ in compilation time and inference overhead. 
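Editor's note: chunk_4 above ends by noting that `compile()` has multiple modes. The sketch below simply shows the syntax for the three modes discussed in the surrounding chunks (default, reduce-overhead, max-autotune); which one wins for a given model is an empirical question, as the benchmark tables indicate. The ViT checkpoint is reused from the earlier benchmarking chunks.

```python
# Sketch: the torch.compile() modes discussed in the surrounding chunks.
import torch
from transformers import AutoModelForImageClassification

model = AutoModelForImageClassification.from_pretrained("google/vit-base-patch16-224").to("cuda")

compiled_default = torch.compile(model)                          # fastest to compile
compiled_ro      = torch.compile(model, mode="reduce-overhead")  # lower per-call overhead
compiled_ma      = torch.compile(model, mode="max-autotune")     # longest compile, fastest inference
```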
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_5.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc83358d0cdf72cd09cf25a9779c2b4f04272fa5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_5.txt @@ -0,0 +1 @@ +max-autotune takes longer than reduce-overhead but results in faster inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_6.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..57afa949de6391c3aa511811277751b1f8324c03 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_6.txt @@ -0,0 +1 @@ +Default mode is fastest for compilation but is not as efficient compared to reduce-overhead for inference time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_7.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..07066ff9e037aef8d24e5e1fc3695222fba9ddc6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_7.txt @@ -0,0 +1 @@ +In this guide, we used the default mode. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_8.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e8ea0d6388d834909d24f3cc664d7c0570ffd2f --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_8.txt @@ -0,0 +1 @@ +You can learn more about it here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_torch_compile/chunk_9.txt b/chunked/nltk_chunking/_perf_torch_compile/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..499274be601623aa6e82db9ca477e3bfed13be4e --- /dev/null +++ b/chunked/nltk_chunking/_perf_torch_compile/chunk_9.txt @@ -0,0 +1 @@ +We benchmarked torch.compile with different computer vision models, tasks, types of hardware, and batch sizes on torch version 2.0.1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_0.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac2036a4d0fc6dca2349dc64c57c0ea7a961552d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_0.txt @@ -0,0 +1,3 @@ + +Efficient Training on CPU +This guide focuses on training large models efficiently on CPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_1.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..177b0f06628d24ef0866ccc5beb17aede900af58 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_1.txt @@ -0,0 +1,2 @@ +Mixed precision with IPEX +Mixed precision uses single (fp32) and half-precision (bf16/fp16) data types in a model to accelerate training or inference while still preserving much of the single-precision accuracy. 
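Editor's note: as a concrete illustration of the CPU mixed-precision idea introduced in the last chunk above, PyTorch's autocast context can run bf16 inference on CPU. This is a sketch, not from the chunked docs; the checkpoint is the small sentiment model used earlier in the document, and real speedups require a CPU with native bf16 support.

```python
# Sketch: bf16 autocast on CPU (gains require hardware bf16, e.g. recent Xeon CPUs).
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

inputs = tokenizer("Mixed precision on CPU is worth benchmarking.", return_tensors="pt")
with torch.no_grad(), torch.autocast(device_type="cpu", dtype=torch.bfloat16):
    logits = model(**inputs).logits
print(logits)
```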
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_10.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..4cedec3795429275a6d663f03f6a0c3f8e120cd7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_10.txt @@ -0,0 +1,2 @@ +pip install intel_extension_for_pytorch== -f https://developer.intel.com/ipex-whl-stable-cpu +You can check the latest versions in ipex-whl-stable-cpu if needed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_11.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fecbc48482b633374d5d8eee37708d75be36f78 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_11.txt @@ -0,0 +1 @@ +Check more approaches for IPEX installation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_12.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..052fd4e10aaf8ff1232a98287bcd12a70ee7a0e1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_12.txt @@ -0,0 +1,2 @@ +Usage in Trainer +To enable auto mixed precision with IPEX in Trainer, users should add use_ipex, bf16 and no_cuda in training command arguments. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_13.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4ff12c7020bb874828d9dd312d0272cea05bf86 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_13.txt @@ -0,0 +1,29 @@ +Take an example of the use cases on Transformers question-answering + +Training with IPEX using BF16 auto mixed precision on CPU: + + python run_qa.py \ +--model_name_or_path google-bert/bert-base-uncased \ +--dataset_name squad \ +--do_train \ +--do_eval \ +--per_device_train_batch_size 12 \ +--learning_rate 3e-5 \ +--num_train_epochs 2 \ +--max_seq_length 384 \ +--doc_stride 128 \ +--output_dir /tmp/debug_squad/ \ +--use_ipex \ +--bf16 \ +--use_cpu +If you want to enable use_ipex and bf16 in your script, add these parameters to TrainingArguments like this: +diff +training_args = TrainingArguments( + output_dir=args.output_path, ++ bf16=True, ++ use_ipex=True, ++ use_cpu=True, + **kwargs +) +Practice example +Blog: Accelerating PyTorch Transformers with Intel Sapphire Rapids \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_2.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..b0aaab3687c061202d10bf085059c9548e51fd0c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_2.txt @@ -0,0 +1 @@ +Modern CPUs such as 3rd and 4th Gen Intel® Xeon® Scalable processors natively support bf16, so you should get more performance out of the box by enabling mixed precision training with bf16. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_3.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3169935ddd591952cd993740fa4af1593f5fd2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_3.txt @@ -0,0 +1 @@ +To further maximize training performance, you can use Intel® Extension for PyTorch (IPEX), which is a library built on PyTorch and adds additional CPU instruction level architecture (ISA) level support such as Intel® Advanced Vector Extensions 512 Vector Neural Network Instructions (Intel® AVX512-VNNI), and Intel® Advanced Matrix Extensions (Intel® AMX) for an extra performance boost on Intel CPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_4.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..495bc1c1ba43b3c14fe1568a494e91891c04a788 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_4.txt @@ -0,0 +1 @@ +However, CPUs with only AVX2 (e.g., AMD or older Intel CPUs) are not guaranteed to have better performance under IPEX. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_5.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..d869cd4f02404cf38d7fad9dcfd9efc978ab90b2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_5.txt @@ -0,0 +1 @@ +Auto Mixed Precision (AMP) for CPU backends has been enabled since PyTorch 1.10. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_6.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd66b618187a8e3ad3b098ea2152da1051a07f78 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_6.txt @@ -0,0 +1 @@ +AMP support for bf16 on CPUs and bf16 operator optimization is also supported in IPEX and partially upstreamed to the main PyTorch branch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_7.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..88db5d5c2a500f6e164bae9275261beef5aa57c3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_7.txt @@ -0,0 +1 @@ +You can get better performance and user experience with IPEX AMP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_8.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..5214c9dafecbd9ee3a2b7cc0aebfb0e98e6f0ad6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_8.txt @@ -0,0 +1 @@ +Check more detailed information for Auto Mixed Precision. 
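Editor's note: besides the Trainer flags shown in these chunks, IPEX also exposes an `optimize()` entry point for custom training loops. The sketch below follows the commonly documented pattern; treat the exact arguments as assumptions to verify against the IPEX documentation for your installed version.

```python
# Sketch: applying IPEX optimizations plus bf16 to a model/optimizer pair.
import torch
import intel_extension_for_pytorch as ipex
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-uncased")
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

model.train()
# ipex.optimize returns an optimized (model, optimizer) pair; dtype enables bf16 paths.
model, optimizer = ipex.optimize(model, optimizer=optimizer, dtype=torch.bfloat16)

# The training loop would then run its forward passes under
# torch.autocast(device_type="cpu", dtype=torch.bfloat16).
```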
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu/chunk_9.txt b/chunked/nltk_chunking/_perf_train_cpu/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..661b22aa14cf043a61cde7998561bbf20cdf9d77 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu/chunk_9.txt @@ -0,0 +1,9 @@ +IPEX installation: +IPEX release is following PyTorch, to install via pip: +| PyTorch Version | IPEX version | +| :---------------: | :----------: | +| 2.1.x | 2.1.100+cpu | +| 2.0.x | 2.0.100+cpu | +| 1.13 | 1.13.0+cpu | +| 1.12 | 1.12.300+cpu | +Please run pip list | grep torch to get your pytorch_version, so you can get the IPEX version_name. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_0.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..949655dce1bc4ba529ed5ae4ac1a684de11db902 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_0.txt @@ -0,0 +1,3 @@ + +Efficient Training on Multiple CPUs +When training on a single CPU is too slow, we can use multiple CPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_1.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba071e05e02509a2375f477aaf59d6ef9757c4ab --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_1.txt @@ -0,0 +1,2 @@ +This guide focuses on PyTorch-based DDP enabling +distributed CPU training efficiently on bare metal and Kubernetes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_10.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6966750fbde10971c37a1eefa248e86278fe1c61 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_10.txt @@ -0,0 +1 @@ +This component is part of the Intel® oneAPI HPC Toolkit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_11.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d0119dbfe3ac862fb4eb8b4bbcdd68647a8eef1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_11.txt @@ -0,0 +1 @@ +oneccl_bindings_for_pytorch is installed along with the MPI tool set. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_12.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..6d0eed4dc44bb1d5c84b244c20d0c3e063590427 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_12.txt @@ -0,0 +1 @@ +Need to source the environment before using it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_13.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..5575dbe68cfa045a498f526fcc2d25b4e63ff1fb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_13.txt @@ -0,0 +1,10 @@ +for Intel® oneCCL >= 1.12.0 + +oneccl_bindings_for_pytorch_path=$(python -c "from oneccl_bindings_for_pytorch import cwd; print(cwd)") +source $oneccl_bindings_for_pytorch_path/env/setvars.sh +for Intel® oneCCL whose version < 1.12.0 + +torch_ccl_path=$(python -c "import torch; import torch_ccl; import os; print(os.path.abspath(os.path.dirname(torch_ccl.__file__)))") +source $torch_ccl_path/env/setvars.sh +Intel® Extension for PyTorch installation +Intel Extension for PyTorch (IPEX) provides performance optimizations for CPU training with both Float32 and BFloat16 (refer to the single CPU section to learn more). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_14.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ea4f24f855527f3fdd342a8b1fe56cf93b0bacd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_14.txt @@ -0,0 +1 @@ +The following "Usage in Trainer" takes mpirun in Intel® MPI library as an example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_15.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..3aa0e92aabc924b564b1c820020cbc9f53377902 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_15.txt @@ -0,0 +1,2 @@ +Usage in Trainer +To enable multi CPU distributed training in the Trainer with the ccl backend, users should add --ddp_backend ccl in the command arguments. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_16.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..0bd3d3248d3001114a5486d2cea931ba45530861 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_16.txt @@ -0,0 +1,2 @@ +Let's see an example with the question-answering example +The following command enables training with 2 processes on one Xeon node, with one process running per one socket. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_17.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..d19a98019cd146232c62dd7c1e452edd65df516e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_17.txt @@ -0,0 +1 @@ +The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. 
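The full mpirun command is shown in the script below; as a hedged orientation aid, the snippet here sketches the programmatic counterpart of the key flags (`--ddp_backend ccl`, `--use_ipex`, `--no_cuda`) via `TrainingArguments`. The hyperparameters and output path are illustrative placeholders, and field availability should be checked against your transformers version.

```python
# Sketch of the TrainingArguments equivalent of the CLI flags used in this guide.
# Values are illustrative placeholders.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="/tmp/debug_squad/",
    per_device_train_batch_size=12,
    learning_rate=3e-5,
    num_train_epochs=2,
    ddp_backend="ccl",   # use the oneCCL backend for distributed CPU training
    use_ipex=True,       # enable Intel Extension for PyTorch optimizations
    no_cuda=True,        # train on CPU (newer transformers versions spell this use_cpu=True)
    bf16=False,          # set True if the hardware supports bfloat16
)
```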
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_18.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b652ece8cfe4d5946fa13aeaa9e196267ba493b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_18.txt @@ -0,0 +1,19 @@ +shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=127.0.0.1 + mpirun -n 2 -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path google-bert/bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex +The following command enables training with a total of four processes on two Xeons (node0 and node1, taking node0 as the main process), ppn (processes per node) is set to 2, with one process running per one socket. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_19.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..d19a98019cd146232c62dd7c1e452edd65df516e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_19.txt @@ -0,0 +1 @@ +The variables OMP_NUM_THREADS/CCL_WORKER_COUNT can be tuned for optimal performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_2.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..acf3c2438c468f0a801cc772812589d1862416c7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_2.txt @@ -0,0 +1,2 @@ +Intel® oneCCL Bindings for PyTorch +Intel® oneCCL (collective communications library) is a library for efficient distributed deep learning training implementing such collectives like allreduce, allgather, alltoall. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_20.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..53e300bd3b3149ea634acaa807f048f86d15041d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_20.txt @@ -0,0 +1 @@ +In node0, you need to create a configuration file which contains the IP addresses of each node (for example hostfile) and pass that configuration file path as an argument. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_21.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f018ef0751fdab11e14166c7fa553e9120d9e3b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_21.txt @@ -0,0 +1,28 @@ +shell script + cat hostfile + xxx.xxx.xxx.xxx #node0 ip + xxx.xxx.xxx.xxx #node1 ip +Now, run the following command in node0 and 4DDP will be enabled in node0 and node1 with BF16 auto mixed precision: +shell script + export CCL_WORKER_COUNT=1 + export MASTER_ADDR=xxx.xxx.xxx.xxx #node0 ip + mpirun -f hostfile -n 4 -ppn 2 \ + -genv OMP_NUM_THREADS=23 \ + python3 run_qa.py \ + --model_name_or_path google-bert/bert-large-uncased \ + --dataset_name squad \ + --do_train \ + --do_eval \ + --per_device_train_batch_size 12 \ + --learning_rate 3e-5 \ + --num_train_epochs 2 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + --no_cuda \ + --ddp_backend ccl \ + --use_ipex \ + --bf16 +Usage with Kubernetes +The same distributed training job from the previous section can be deployed to a Kubernetes cluster using the +Kubeflow PyTorchJob training operator. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_22.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..21816c1d820b0b6cc3e013f3f3ca80c426e3e538 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_22.txt @@ -0,0 +1,6 @@ +Setup +This example assumes that you have: +* Access to a Kubernetes cluster with Kubeflow installed +* kubectl installed and configured to access the Kubernetes cluster +* A Persistent Volume Claim (PVC) that can be used + to store datasets and model files. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_23.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..393c6aa906cd0f538433d1eb29122bd892907d59 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_23.txt @@ -0,0 +1,2 @@ +There are multiple options for setting up the PVC including using an NFS + storage class or a cloud storage bucket. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_24.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed49155ef07781568135f918dade2c828a593c95 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_24.txt @@ -0,0 +1 @@ +* A Docker container that includes your model training script and all the dependencies needed to run the script. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_25.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e79f8d1f1adc8811fa15133127ffabf1aded002 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_25.txt @@ -0,0 +1,3 @@ +For + distributed CPU training jobs, this typically includes PyTorch, Transformers, Intel Extension for PyTorch, Intel + oneCCL Bindings for PyTorch, and OpenSSH to communicate between the containers. 
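Before building the image described below, it can be useful to sanity-check that a candidate base image actually ships the Python pieces listed above. The snippet is only a hedged convenience check (OpenSSH has to be verified separately); the module names follow the packages referenced in this guide.

```python
# Hedged sanity check: run inside the container image to confirm the Python
# training stack is present. An ImportError identifies the missing dependency.
import importlib

for name in (
    "torch",
    "transformers",
    "intel_extension_for_pytorch",
    "oneccl_bindings_for_pytorch",
):
    module = importlib.import_module(name)
    print(name, getattr(module, "__version__", "version attribute not found"))
```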
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_26.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1ff39ffcd71ccc774399a532ac7a3273b2922b8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_26.txt @@ -0,0 +1,12 @@ +The snippet below is an example of a Dockerfile that uses a base image that supports distributed CPU training and then +extracts a Transformers release to the /workspace directory, so that the example scripts are included in the image: +```dockerfile +FROM intel/ai-workflows:torch-2.0.1-huggingface-multinode-py3.9 +WORKDIR /workspace +Download and extract the transformers code +ARG HF_TRANSFORMERS_VER="4.35.2" +RUN mkdir transformers && \ + curl -sSL --retry 5 https://github.com/huggingface/transformers/archive/refs/tags/v${HF_TRANSFORMERS_VER}.tar.gz | tar -C transformers --strip-components=1 -xzf - + +The image needs to be built and copied to the cluster's nodes or pushed to a container registry prior to deploying the +PyTorchJob to the cluster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_27.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1411c9784863c8370ea0fbed771c2ae21aa80a0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_27.txt @@ -0,0 +1,3 @@ +PyTorchJob Specification File +The Kubeflow PyTorchJob is used to run the distributed +training job on the cluster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_28.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d6a5bd30d2255773083f7dfc87acac505afbb92 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_28.txt @@ -0,0 +1,9 @@ +The yaml file for the PyTorchJob defines parameters such as: + * The name of the PyTorchJob + * The number of replicas (workers) + * The python script and it's parameters that will be used to run the training job + * The types of resources (node selector, memory, and CPU) needed for each worker + * The image/tag for the Docker container to use + * Environment variables + * A volume mount for the PVC +The volume mount defines a path where the PVC will be mounted in the container for each worker pod. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_29.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e88254897272896ae0150e22a9d69f3d6134a00 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_29.txt @@ -0,0 +1,2 @@ +This location can be +used for the dataset, checkpoint files, and the saved model after training completes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_3.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3414b2257e6df33bee0f32378454b4ba9049e30 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_3.txt @@ -0,0 +1 @@ +For more information on oneCCL, please refer to the oneCCL documentation and oneCCL specification. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_30.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a43cc5a526929fd94c5f49eef832c4127243e8d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_30.txt @@ -0,0 +1,2 @@ +The snippet below is an example of a yaml file for a PyTorchJob with 4 workers running the +question-answering example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_31.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..bbf812415ca008eaa3f49c177cc724d4185ff8ce --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_31.txt @@ -0,0 +1,84 @@ +yaml +apiVersion: "kubeflow.org/v1" +kind: PyTorchJob +metadata: + name: transformers-pytorchjob + namespace: kubeflow +spec: + elasticPolicy: + rdzvBackend: c10d + minReplicas: 1 + maxReplicas: 4 + maxRestarts: 10 + pytorchReplicaSpecs: + Worker: + replicas: 4 # The number of worker pods + restartPolicy: OnFailure + template: + spec: + containers: + - name: pytorch + image: : # Specify the docker image to use for the worker pods + imagePullPolicy: IfNotPresent + command: + - torchrun + - /workspace/transformers/examples/pytorch/question-answering/run_qa.py + - --model_name_or_path + - "google-bert/bert-large-uncased" + - --dataset_name + - "squad" + - --do_train + - --do_eval + - --per_device_train_batch_size + - "12" + - --learning_rate + - "3e-5" + - --num_train_epochs + - "2" + - --max_seq_length + - "384" + - --doc_stride + - "128" + - --output_dir + - "/tmp/pvc-mount/output" + - --no_cuda + - --ddp_backend + - "ccl" + - --use_ipex + - --bf16 # Specify --bf16 if your hardware supports bfloat16 + env: + - name: LD_PRELOAD + value: "/usr/lib/x86_64-linux-gnu/libtcmalloc.so.4.5.9:/usr/local/lib/libiomp5.so" + - name: TRANSFORMERS_CACHE + value: "/tmp/pvc-mount/transformers_cache" + - name: HF_DATASETS_CACHE + value: "/tmp/pvc-mount/hf_datasets_cache" + - name: LOGLEVEL + value: "INFO" + - name: CCL_WORKER_COUNT + value: "1" + - name: OMP_NUM_THREADS # Can be tuned for optimal performance + + resources: + limits: + cpu: 200 # Update the CPU and memory limit values based on your nodes + memory: 128Gi + requests: + cpu: 200 # Update the CPU and memory request values based on your nodes + memory: 128Gi + volumeMounts: + - name: pvc-volume + mountPath: /tmp/pvc-mount + - mountPath: /dev/shm + name: dshm + restartPolicy: Never + nodeSelector: # Optionally use the node selector to specify what types of nodes to use for the workers + node-type: spr + volumes: + - name: pvc-volume + persistentVolumeClaim: + claimName: transformers-pvc + - name: dshm + emptyDir: + medium: Memory +To run this example, update the yaml based on your training script and the nodes in your cluster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_32.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d8351015521342c75d2f1a328ae0cd54a15d0f1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_32.txt @@ -0,0 +1,3 @@ +The CPU resource limits/requests in the yaml are defined in cpu units +where 1 CPU unit is equivalent to 1 physical CPU core or 1 virtual core (depending on whether the node is a physical +host or a VM). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_33.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..f2658e6080cfff74fe5b8b1646fc58184edba278 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_33.txt @@ -0,0 +1,2 @@ +The amount of CPU and memory limits/requests defined in the yaml should be less than the amount of +available CPU/memory capacity on a single machine. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_34.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..082ee4bbdacd001a7d3fa8ddd2838362e2e10290 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_34.txt @@ -0,0 +1,2 @@ +It is usually a good idea to not use the entire machine's capacity in +order to leave some resources for the kubelet and OS. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_35.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..4583b077346e94e33a725e5eef2b6884963868b4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_35.txt @@ -0,0 +1,3 @@ +In order to get "guaranteed" +quality of service for the worker pods, +set the same CPU and memory amounts for both the resource limits and requests. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_36.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..62a5c6ad25e9b461b71457f8b5b4dac45f555828 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_36.txt @@ -0,0 +1,6 @@ +Deploy +After the PyTorchJob spec has been updated with values appropriate for your cluster and training job, it can be deployed +to the cluster using: + +kubectl create -f pytorchjob.yaml +The kubectl get pods -n kubeflow command can then be used to list the pods in the kubeflow namespace. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_37.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..1d9165035645bb459e9d359a39f62654e8f288f8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_37.txt @@ -0,0 +1,2 @@ +You should see +the worker pods for the PyTorchJob that was just deployed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_38.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..04e7f5b14aa79d630ede11229192ba54af564a63 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_38.txt @@ -0,0 +1,2 @@ +At first, they will probably have a status of "Pending" as +the containers get pulled and created, then the status should change to "Running". 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_39.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..14a3dc0966b77c8fbc00c424d6da93efbeb28d4a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_39.txt @@ -0,0 +1,8 @@ +NAME READY STATUS RESTARTS AGE + +transformers-pytorchjob-worker-0 1/1 Running 0 7m37s +transformers-pytorchjob-worker-1 1/1 Running 0 7m37s +transformers-pytorchjob-worker-2 1/1 Running 0 7m37s +transformers-pytorchjob-worker-3 1/1 Running 0 7m37s + +The logs for worker can be viewed using kubectl logs -n kubeflow . \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_4.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..77b088f0aa55fe8f04246860d38fad26f7f3ccba --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_4.txt @@ -0,0 +1,2 @@ +Module oneccl_bindings_for_pytorch (torch_ccl before version 1.12) implements PyTorch C10D ProcessGroup API and can be dynamically loaded as external ProcessGroup and only works on Linux platform now +Check more detailed information for oneccl_bind_pt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_40.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfa73f43b017248eefec70941e63448151e5ff7d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_40.txt @@ -0,0 +1,4 @@ +Add -f to stream the logs, for example: + +kubectl logs -n kubeflow transformers-pytorchjob-worker-0 -f +After the training job completes, the trained model can be copied from the PVC or storage location. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_41.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..c10f18dfa7fd55af93e5327f7750e4c03fd1fe74 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_41.txt @@ -0,0 +1,2 @@ +When you are done +with the job, the PyTorchJob resource can be deleted from the cluster using kubectl delete -f pytorchjob.yaml. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_42.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..460c5186d64b525a8e4c877594242c2d9599ec47 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_42.txt @@ -0,0 +1,3 @@ +Summary +This guide covered running distributed PyTorch training jobs using multiple CPUs on bare metal and on a Kubernetes +cluster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_43.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..c200dd4285edcc01f94f49002d928ce3d301589c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_43.txt @@ -0,0 +1,2 @@ +Both cases utilize Intel Extension for PyTorch and Intel oneCCL Bindings for PyTorch for optimal training +performance, and can be used as a template to run your own workload on multiple nodes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_5.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f56c58b7d2708f61e273317dd1afd506e842a7b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_5.txt @@ -0,0 +1,10 @@ +Intel® oneCCL Bindings for PyTorch installation +Wheel files are available for the following Python versions: +| Extension Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | Python 3.10 | +| :---------------: | :--------: | :--------: | :--------: | :--------: | :---------: | +| 2.1.0 | | √ | √ | √ | √ | +| 2.0.0 | | √ | √ | √ | √ | +| 1.13.0 | | √ | √ | √ | √ | +| 1.12.100 | | √ | √ | √ | √ | +| 1.12.0 | | √ | √ | √ | √ | +Please run pip list | grep torch to get your pytorch_version. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_6.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..e286633aa6cb1f96a344023fd09dc175fe425513 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_6.txt @@ -0,0 +1,2 @@ +pip install oneccl_bind_pt=={pytorch_version} -f https://developer.intel.com/ipex-whl-stable-cpu +where {pytorch_version} should be your PyTorch version, for instance 2.1.0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_7.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..01d201f74689f4c1285a2497f7093eb064734790 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_7.txt @@ -0,0 +1 @@ +Check more approaches for oneccl_bind_pt installation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_8.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..94f58e6e7625322925585a5ba2d77108ac94bd7d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_8.txt @@ -0,0 +1 @@ +Versions of oneCCL and PyTorch must match. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_cpu_many/chunk_9.txt b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..76f6ee3355973a38e0437f48896a8562393bf5af --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_cpu_many/chunk_9.txt @@ -0,0 +1,5 @@ +oneccl_bindings_for_pytorch 1.12.0 prebuilt wheel does not work with PyTorch 1.12.1 (it is for PyTorch 1.12.0) +PyTorch 1.12.1 should work with oneccl_bindings_for_pytorch 1.12.100 + +Intel® MPI library +Use this standards-based MPI implementation to deliver flexible, efficient, scalable cluster messaging on Intel® architecture. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_0.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb79e4f414a743cfd021b4ded3be283b6580c887 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_0.txt @@ -0,0 +1,4 @@ + +Efficient Training on Multiple GPUs +If training a model on a single GPU is too slow or if the model's weights do not fit in a single GPU's memory, transitioning +to a multi-GPU setup may be a viable option. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_1.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..af75877a937e69088dc7d57816c2886e0c5296d3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_1.txt @@ -0,0 +1,3 @@ +Prior to making this transition, thoroughly explore all the strategies covered +in the Methods and tools for efficient training on a single GPU as they are universally applicable +to model training on any number of GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_10.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ab0de60082fbbd26a10dc465e104bf579dd2870 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_10.txt @@ -0,0 +1,2 @@ +Scalability strategy +Begin by estimating how much vRAM is required to train your model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_100.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..632391349addb332cd549bca3210300687860cdd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_100.txt @@ -0,0 +1,2 @@ +You can see that there's a forward path of 4 pipe stages (F0, F1, F2 and F3) followed by +a backward path in reverse order (B3, B2, B1, and B0). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_101.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..6e049ee099ff0bc3c5b85e4b0be5f6de66c19c88 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_101.txt @@ -0,0 +1,2 @@ +PP introduces a new hyperparameter to tune - chunks, which determines how many data chunks are sent in a sequence +through the same pipe stage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_102.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f7a1b9ac01c05ba042344797be616260d4ebc89 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_102.txt @@ -0,0 +1 @@ +For example, in the bottom diagram you can see chunks=4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_103.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..b45dcf391a4b27db480f0c4c96a3fd67d0d064fa --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_103.txt @@ -0,0 +1,2 @@ +GPU0 performs the same +forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do complete their work. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_104.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..643ab0b56c37c82fcc6f38865f4721f985969bbb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_104.txt @@ -0,0 +1,2 @@ +Only when the other GPUs begin to complete their work, GPU0 starts to work again doing the backward path for chunks +3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_105.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..18434bc77ae2d45b41b09892a6beb0d61c1e6cb2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_105.txt @@ -0,0 +1 @@ +Note that this is the same concept as gradient accumulation steps. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_106.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..2d433844b6f9e431b4215658d03672f9808b2dbd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_106.txt @@ -0,0 +1,2 @@ +PyTorch uses chunks, while DeepSpeed refers +to the same hyperparameter as gradient accumulation steps. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_107.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f81073e42b99c26687c812a2ea0f880dc0acc0a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_107.txt @@ -0,0 +1 @@ +Because of the chunks, PP introduces the notion of micro-batches (MBS). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_108.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e96da1a5ffad27bd50d46e6c19abbebd089dcc6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_108.txt @@ -0,0 +1,3 @@ +DP splits the global data batch size into +mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of +256 each (1024/4). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_109.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..b2178c0a46c680531279137885053663a54dadeb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_109.txt @@ -0,0 +1 @@ +And if the number of chunks (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_11.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e312dde7cbf5d8cff8f4844af40ed7467491ccd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_11.txt @@ -0,0 +1,3 @@ +For models hosted on the 🤗 Hub, use our +Model Memory Calculator, which gives you +accurate calculations within a few percent margin. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_110.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d3c9f784d4f23819efde615fb648ba3e8407908 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_110.txt @@ -0,0 +1,2 @@ +Each +Pipeline stage works with a single micro-batch at a time. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_111.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c8f0791738474c1b03c4052f6a79132657a0e2d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_111.txt @@ -0,0 +1,2 @@ +To calculate the global batch size of the DP + PP setup, +use the formula: mbs * chunks * dp_degree (8 * 32 * 4 = 1024). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_112.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f3325f2a04e7fdd5af144c742a7c44c7ffdebed --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_112.txt @@ -0,0 +1 @@ +With chunks=1 you end up with the naive MP, which is inefficient. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_113.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f9068284fe66ab86e4af2a96d765e955a80504a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_113.txt @@ -0,0 +1,2 @@ +With a large chunks value you end up with +tiny micro-batch sizes which is also inefficient. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_114.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..11679431703dc7e2c1c4b2733230573a7e5336af --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_114.txt @@ -0,0 +1,2 @@ +For this reason, we encourage to experiment with the chunks value to +find the one that leads to the most efficient GPUs utilization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_115.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..af86a3896fef55caf26cd261d22723078b60a661 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_115.txt @@ -0,0 +1,2 @@ +You may notice a bubble of "dead" time on the diagram that can't be parallelized because the last forward stage +has to wait for backward to complete the pipeline. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_116.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..6492a09bdd5d1eaf540b351b5ff0bc62f47a296e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_116.txt @@ -0,0 +1,2 @@ +The purpose of finding the best value for chunks is to enable a high +concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble. 
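To make the batch-size bookkeeping above concrete, here is the arithmetic from the example (global batch 1024, DP degree 4, chunks 32) written out; the variable names are only for illustration.

```python
# Worked example of the numbers above: DP + PP with a global batch size of 1024,
# DP degree 4, and chunks (gradient accumulation steps) 32.
global_batch_size = 1024
dp_degree = 4
chunks = 32

mini_batch_size = global_batch_size // dp_degree   # 1024 / 4 = 256 per DP rank
micro_batch_size = mini_batch_size // chunks        # 256 / 32 = 8 per pipeline stage

# Reconstruct the global batch size with the formula mbs * chunks * dp_degree.
assert micro_batch_size * chunks * dp_degree == global_batch_size  # 8 * 32 * 4 = 1024
print(mini_batch_size, micro_batch_size)             # 256 8
```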
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_117.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..98fa3e6fae1c85f6048888f727e823dae9b6e56d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_117.txt @@ -0,0 +1,6 @@ +Pipeline API solutions have been implemented in: +- PyTorch +- DeepSpeed +- Megatron-LM +These come with some shortcomings: +- They have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a nn.Sequential sequence of the same, which may require changes to the design of the model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_118.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c48cfd845526ba2a53414d7a68c909f2a60941e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_118.txt @@ -0,0 +1 @@ +- Currently the Pipeline API is very restricted. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_119.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..237386af3d6b3e38429bff353a6b135347812415 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_119.txt @@ -0,0 +1 @@ +If you had a bunch of Python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_12.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b49648e74073e42389d06a90800016af7b5cc16 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_12.txt @@ -0,0 +1,3 @@ +Parallelization strategy for a single Node / multi-GPU setup +When training a model on a single node with multiple GPUs, your choice of parallelization strategy can significantly +impact performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_120.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..87549034aa819859e4ca36dac51c01be08e729eb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_120.txt @@ -0,0 +1 @@ +Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_121.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..fab3a59ddcfd85fb17fd84588fbd833cf40dadf9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_121.txt @@ -0,0 +1 @@ +These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_122.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..45c96edaebcb888ad92a004a0de248cb06756900 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_122.txt @@ -0,0 +1,2 @@ +Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693 +- Conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_123.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..d9b5d9a3898df9009e893a70a876eb5998d9d077 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_123.txt @@ -0,0 +1 @@ +- They have to arrange each layer so that the output of one layer becomes an input to the other layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_124.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..b7b7bcb6042d17c58933189a3c90f187b8c3e0a5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_124.txt @@ -0,0 +1,5 @@ +More recent solutions include: +- Varuna +- Sagemaker +We have not experimented with Varuna and SageMaker but their papers report that they have overcome the list of problems +mentioned above and that they require smaller changes to the user's model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_125.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..87abaa3b836ea24201f66df2261d21a043d296ca --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_125.txt @@ -0,0 +1,2 @@ +Implementations: +- PyTorch (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_126.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..268446418d2df8bb51f66ed8b96281e876937bea --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_126.txt @@ -0,0 +1,3 @@ +Some examples +- DeepSpeed +- Megatron-LM has an internal implementation - no API. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_127.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..48667b5cc74d548463c22a4663f1df9a0dfc646e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_127.txt @@ -0,0 +1,2 @@ +- Varuna +- SageMaker - this is a proprietary solution that can only be used on AWS. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_128.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..24e24cdc667e9713c1b8b0984bbe82c937268b9c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_128.txt @@ -0,0 +1 @@ +- OSLO - this is implemented based on the Hugging Face Transformers. 
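To illustrate the nn.Sequential restriction discussed in the shortcomings above, the toy sketch below shows the general shape such a rewrite takes: each stage consumes and produces plain tensors with the batch dimension first. It is not a real Transformers model conversion.

```python
# Toy illustration of the Pipeline API restriction: express the model as an
# nn.Sequential whose stages take and return plain tensors (batch dim first).
import torch
from torch import nn

class TinyBlock(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.linear = nn.Linear(dim, dim)

    def forward(self, hidden_states):   # tensor in -> tensor out, no extra arguments
        return torch.relu(self.linear(hidden_states))

# Each element of the Sequential is a candidate pipeline stage; the output of
# one stage is fed directly to the next.
model = nn.Sequential(
    nn.Linear(16, 32),
    TinyBlock(32),
    TinyBlock(32),
    nn.Linear(32, 4),
)

x = torch.randn(8, 16)    # batch dimension first, so it can be chunked into micro-batches
print(model(x).shape)     # torch.Size([8, 4])
```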
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_129.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..28de7b60e544f56990b20c540da42bce75a58a4e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_129.txt @@ -0,0 +1 @@ +🤗 Transformers status: as of this writing none of the models supports full-PP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_13.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..0479bfde239c218224f68019cc243db4e660f290 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_13.txt @@ -0,0 +1,6 @@ +Here's a breakdown of your options: +Case 1: Your model fits onto a single GPU +If your model can comfortably fit onto a single GPU, you have two primary options: + +DDP - Distributed DataParallel +ZeRO - depending on the situation and configuration used, this method may or may not be faster, however, it's worth experimenting with it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_130.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..f345aafacdb19466c3d6cd36f644835766c18637 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_130.txt @@ -0,0 +1 @@ +GPT2 and T5 models have naive MP support. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_131.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..f0a443f66fea28aef5b2710d07111c31e3fb2093 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_131.txt @@ -0,0 +1 @@ +The main obstacle is being unable to convert the models to nn.Sequential and have all the inputs to be Tensors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_132.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..717e091b151e461e52e3f0f1edfce49bfa858fd6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_132.txt @@ -0,0 +1,2 @@ +This +is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_133.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..0effbc6afe2b3f92b1d3b019d4f54fb81300c7c4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_133.txt @@ -0,0 +1,5 @@ +DeepSpeed and Megatron-LM integrations are available in 🤗 Accelerate +Other approaches: +DeepSpeed, Varuna and SageMaker use the concept of an Interleaved Pipeline + +Here the bubble (idle time) is further minimized by prioritizing backward passes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_134.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..3219acec8d198305fd92d49894fafa6d9f334867 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_134.txt @@ -0,0 +1,2 @@ +Varuna further attempts to improve the +schedule by using simulations to discover the most efficient scheduling. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_135.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a16ca543532b19ef491904c6b54683538477c51 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_135.txt @@ -0,0 +1 @@ +OSLO has pipeline parallelism implementation based on the Transformers without nn.Sequential conversion. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_136.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..9168173b8d3f8e81c1e0ac569e383aa64cafdd7c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_136.txt @@ -0,0 +1,2 @@ +Tensor Parallelism +In Tensor Parallelism, each GPU processes a slice of a tensor and only aggregates the full tensor for operations requiring it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_137.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6290ed2852dff0d4b3cb05c72fffc1946100ead --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_137.txt @@ -0,0 +1,2 @@ +To describe this method, this section of the guide relies on the concepts and diagrams from the Megatron-LM +paper: Efficient Large-Scale Language Model Training on GPU Clusters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_138.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3cdcb2982f99452c82b8807e8e2631d677a0391 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_138.txt @@ -0,0 +1 @@ +The main building block of any transformer is a fully connected nn.Linear followed by a nonlinear activation GeLU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_139.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..0816a8872d2ed9d831a51ab1b02b784c600359f5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_139.txt @@ -0,0 +1,2 @@ +The dot dot-product part of it, following the Megatron's paper notation, can be written as Y = GeLU(XA), where X is +an input vector, Y is the output vector, and A is the weight matrix. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_14.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cca241dc19afcf0ba9f7388cc6fa0be74936c79f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_14.txt @@ -0,0 +1,9 @@ +Case 2: Your model doesn't fit onto a single GPU: +If your model is too large for a single GPU, you have several alternatives to consider: + +PipelineParallel (PP) +ZeRO +TensorParallel (TP) + +With very fast inter-node connectivity (e.g., NVLINK or NVSwitch) all three strategies (PP, ZeRO, TP) should result in +similar performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_140.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed2baa6ee9f2a7f785eb523019dcf4b292653bca --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_140.txt @@ -0,0 +1,7 @@ +If we look at the computation in matrix form, you can see how the matrix multiplication can be split between multiple GPUs: + +If we split the weight matrix A column-wise across N GPUs and perform matrix multiplications XA_1 through XA_n in parallel, +then we will end up with N output vectors Y_1, Y_2, , Y_n which can be fed into GeLU independently: + +Using this principle, we can update a multi-layer perceptron of arbitrary depth, without the need for any synchronization +between GPUs until the very end, where we need to reconstruct the output vector from shards. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_141.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..48b20d105b95b804c2fc7a1b7bdd1e3cced6edd1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_141.txt @@ -0,0 +1,5 @@ +The Megatron-LM paper authors +provide a helpful illustration for that: + +Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having +multiple independent heads! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_142.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5a4dc5d7022de1395bcc3b7e60a9357b2b9d4f0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_142.txt @@ -0,0 +1 @@ +Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_143.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..00684e31a0761f525844f16a184a40ad99ca639a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_143.txt @@ -0,0 +1 @@ +Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_144.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ae23c6a4ef6da23564012a54e08646c03955cb0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_144.txt @@ -0,0 +1,2 @@ +If you need a TP degree of 8, you need to use +nodes that have at least 8 GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_145.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..a68041cfae0ecbdf5b2687dfa6067c85d2dedae2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_145.txt @@ -0,0 +1 @@ +This section is based on the original much more detailed TP overview. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_146.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..c301387e8addca430bcbb5e4c84a3028556f5644 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_146.txt @@ -0,0 +1 @@ +by @anton-l. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_147.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..39378518d2575920280020fba079df0de5ebf29c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_147.txt @@ -0,0 +1,6 @@ +Alternative names: +- DeepSpeed calls it tensor slicing +Implementations: +- Megatron-LM has an internal implementation, as it's very model-specific +- parallelformers (only inference at the moment) +- SageMaker - this is a proprietary solution that can only be used on AWS. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_148.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..290dde3e114f00a00bdef5b5ff2cb3dccbf9c945 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_148.txt @@ -0,0 +1 @@ +- OSLO has the tensor parallelism implementation based on the Transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_149.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd9980cd0b0dd05f22f5d7e37f8e1288ce7b6a7c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_149.txt @@ -0,0 +1 @@ +SageMaker combines TP with DP for a more efficient processing. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_15.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..89f6a7a079f5f0aa77b0dfa4797038a6a699c11f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_15.txt @@ -0,0 +1 @@ +However, without these, PP will be faster than TP or ZeRO. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_150.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..ba4a5fc916e98bd6acccf4327580a5ddacef9edc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_150.txt @@ -0,0 +1,3 @@ +🤗 Transformers status: +- core: not yet implemented in the core +- but if you want inference parallelformers provides this support for most of our models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_151.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..6214910f0e1de43f782536ca8785b33fdd160153 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_151.txt @@ -0,0 +1 @@ +So until this is implemented in the core you can use theirs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_152.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe1f1d28d74848bd619e3f546cafbb6249669430 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_152.txt @@ -0,0 +1 @@ +And hopefully training mode will be supported too. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_153.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..716d54c784505e985ac281179742423b968c3a6e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_153.txt @@ -0,0 +1,2 @@ +- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more here +🤗 Accelerate integrates with TP from Megatron-LM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_154.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..c851881b7417109acb8e906e60c65367bf7ac87a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_154.txt @@ -0,0 +1,3 @@ +Data Parallelism + Pipeline Parallelism +The following diagram from the DeepSpeed pipeline tutorial demonstrates +how one can combine DP with PP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_155.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..16a3242333074e26db2420a23a3ec5e994474e77 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_155.txt @@ -0,0 +1 @@ +Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_156.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cf5bfde9d0e0f61648a770cdefb7352b8573e4f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_156.txt @@ -0,0 +1,2 @@ +To DP there is just GPUs 0 +and 1 where it feeds data as if there were just 2 GPUs. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_157.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..2326b66024593e96db8dc7c9b085a6377e9038e3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_157.txt @@ -0,0 +1 @@ +GPU0 "secretly" offloads some of its load to GPU2 using PP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_158.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ee2bdb4fe818cd027cb3e77e83d65e280155df3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_158.txt @@ -0,0 +1 @@ +And GPU1 does the same by enlisting GPU3 to its aid. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_159.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..8ee634bea2b3fdca4a3474e7f14cd8611c34a8d2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_159.txt @@ -0,0 +1 @@ +Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_16.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..2f0fdde3317b07f9aa6b6853215892c895f02b7e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_16.txt @@ -0,0 +1,2 @@ +The degree of TP may also +make a difference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_160.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..b77c98a8c68efe40396445d54c6810c3e05a3cf0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_160.txt @@ -0,0 +1,9 @@ +Implementations: +- DeepSpeed +- Megatron-LM +- Varuna +- SageMaker +- OSLO +🤗 Transformers status: not yet implemented +Data Parallelism + Pipeline Parallelism + Tensor Parallelism +To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_161.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..19a9ef28c29da4c15b73c587d85421bb79b4075f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_161.txt @@ -0,0 +1 @@ +This can be seen in the following diagram. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_162.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..d80479659b326b8f07f14b0d67d373ee1340061e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_162.txt @@ -0,0 +1 @@ +This diagram is from a blog post 3D parallelism: Scaling to trillion-parameter models, which is a good read as well. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_163.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..3bbcedf99f4ea58f06ae435cd779971ea4934f9b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_163.txt @@ -0,0 +1 @@ +Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_164.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ae537cb3e1d990185cecc6388b303fb7002be60 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_164.txt @@ -0,0 +1,2 @@ +Implementations: +- DeepSpeed - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_165.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ce516abe9b05be64e30586cb4934c4b9ed56fef --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_165.txt @@ -0,0 +1,5 @@ +- Megatron-LM +- Varuna +- SageMaker +- OSLO +🤗 Transformers status: not yet implemented, since we have no PP and TP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_166.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..b90be114fe15daa477d1671af666dcdbcbf17217 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_166.txt @@ -0,0 +1,2 @@ +ZeRO Data Parallelism + Pipeline Parallelism + Tensor Parallelism +One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_167.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..583b3fdb67369d816ad4012f07305fae122359b5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_167.txt @@ -0,0 +1,2 @@ +It has already been +discussed in ZeRO Data Parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_168.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..36ad05591f1ba559ea04940032f2e456fe53fea8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_168.txt @@ -0,0 +1 @@ +Normally it's a standalone feature that doesn't require PP or TP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_169.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b6c6a129fb1a8552e0cb12e838aa662bae3207d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_169.txt @@ -0,0 +1 @@ +But it can be combined with PP and TP. 
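To make the GPU-count arithmetic in the chunks above concrete, here is a minimal sketch (plain Python, no framework assumed) of the rule they describe: each parallelism dimension needs a degree of at least 2, and the total world size is the product of the degrees.

```python
# Minimal sketch: GPUs needed when combining parallelism dimensions.
# Each degree is assumed to be at least 2, as in the examples above.
def required_gpus(dp: int = 1, pp: int = 1, tp: int = 1) -> int:
    """Total GPUs = product of the data/pipeline/tensor parallel degrees."""
    return dp * pp * tp

print(required_gpus(dp=2, pp=2))        # DP + PP            -> 4 GPUs
print(required_gpus(dp=2, pp=2, tp=2))  # DP + PP + TP (3D)  -> 8 GPUs
```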
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_17.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..2650bb5f120fd933ab31d585b068fa395c92cb71 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_17.txt @@ -0,0 +1 @@ +It's best to experiment with your specific setup to determine the most suitable strategy. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_170.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..31e5e900c1e38129bd60247e5fcb8de2d20f0c86 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_170.txt @@ -0,0 +1 @@ +When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_171.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..634bc8295b23410e5bb8ba27a4f36bd4f8a559af --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_171.txt @@ -0,0 +1,2 @@ +While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have negative +performance impacts. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_172.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc4d366710eba949acf88d7b205ac68dcc972ded --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_172.txt @@ -0,0 +1,2 @@ +There would need to be an additional reduce-scatter collective for every micro-batch to aggregate +the gradients before sharding, which adds a potentially significant communication overhead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_173.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..3249d1e5cde5ccada02ea1d5f304bef1029e9a27 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_173.txt @@ -0,0 +1,3 @@ +By nature of Pipeline Parallelism, +small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with +minimizing the Pipeline bubble (number of micro-batches). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_174.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..711d3d7ee9d4253b8b2e473094dcef2c271db5a8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_174.txt @@ -0,0 +1 @@ +Therefore those communication costs are going to impact the performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_175.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe00b5f3eb65d235b36832efb478bfa9c6a880a3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_175.txt @@ -0,0 +1 @@ +In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_176.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_176.txt new file mode 100644 index 0000000000000000000000000000000000000000..318133e9f334f2e42dc362a2c92bd22a8de55bd8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_176.txt @@ -0,0 +1,2 @@ +PP already +reduces gradient size by 1/PP, and so gradient sharding savings on top of that are less significant than pure DP. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_177.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_177.txt new file mode 100644 index 0000000000000000000000000000000000000000..552b3526ee8f091dc5b9120a06fe4275d08a4c77 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_177.txt @@ -0,0 +1 @@ +ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_178.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_178.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a8bf4af46d5c47087277b716717d04725216bb8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_178.txt @@ -0,0 +1 @@ +And since we have ZeRO, the other benefit is ZeRO-Offload. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_179.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_179.txt new file mode 100644 index 0000000000000000000000000000000000000000..47f5c884335647c366747b99260de0a92fc2703d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_179.txt @@ -0,0 +1 @@ +Since this is stage 1 optimizer states can be offloaded to CPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_18.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..bcfab404168b7b3866ca78e9f66fc7f5165710fc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_18.txt @@ -0,0 +1 @@ +TP is almost always used within a single node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_180.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_180.txt new file mode 100644 index 0000000000000000000000000000000000000000..536bba68a6ab30290d50a720ea2f48f89d0dfb30 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_180.txt @@ -0,0 +1,2 @@ +Implementations: +- Megatron-DeepSpeed and Megatron-Deepspeed from BigScience, which is the fork of the former repo. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_181.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_181.txt new file mode 100644 index 0000000000000000000000000000000000000000..af5912b6e2cb9b0ca9c7d13f7fde224ec4d50452 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_181.txt @@ -0,0 +1,6 @@ +- OSLO +Important papers: + +Using DeepSpeed and Megatron to Train Megatron-Turing NLG 530B, A Large-Scale Generative Language Model + +🤗 Transformers status: not yet implemented, since we have no PP and TP. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_182.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_182.txt new file mode 100644 index 0000000000000000000000000000000000000000..c604da69e14308dc44c42e473d018f6acfd8a436 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_182.txt @@ -0,0 +1,2 @@ +FlexFlow +FlexFlow also solves the parallelization problem in a slightly different approach. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_183.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_183.txt new file mode 100644 index 0000000000000000000000000000000000000000..78890b33ff60862e7dfa27c1cb35480ae010d70c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_183.txt @@ -0,0 +1,2 @@ +Paper: "Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken +It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_184.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_184.txt new file mode 100644 index 0000000000000000000000000000000000000000..0fa820b26d89107bc99704b5f373921431a98c7c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_184.txt @@ -0,0 +1,8 @@ +Sample = Data Parallelism (sample-wise parallel) +Operator = Parallelize a single operation into several sub-operations +Attribute = Data Parallelism (length-wise parallel) +Parameter = Model Parallelism (regardless of dimension - horizontal or vertical) + +Examples: +* Sample +Let's take 10 batches of sequence length 512. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_185.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_185.txt new file mode 100644 index 0000000000000000000000000000000000000000..818a57b91b41f931f9b12c68a643a76aaf587e86 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_185.txt @@ -0,0 +1 @@ +If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes be 5 x 2 x 512. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_186.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_186.txt new file mode 100644 index 0000000000000000000000000000000000000000..72ab1508871d94d5c54a9d33ebbb1b4b1314a61a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_186.txt @@ -0,0 +1,3 @@ +Operator + +If we perform layer normalization, we compute std first and mean second, and then we can normalize data. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_187.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_187.txt new file mode 100644 index 0000000000000000000000000000000000000000..2bbac54ec51b3f3ca5fa6d22325358a1dad7a7d6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_187.txt @@ -0,0 +1 @@ +Operator parallelism allows computing std and mean in parallel. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_188.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_188.txt new file mode 100644 index 0000000000000000000000000000000000000000..595d8344f22f3fbcf557d9bf1ce5883fd5e47c2b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_188.txt @@ -0,0 +1,2 @@ +So if we parallelize them by operator dimension into 2 +devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_189.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_189.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c1773b8542a532e38c031e47b89900d96666179 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_189.txt @@ -0,0 +1,3 @@ +Attribute + +We have 10 batches of 512 length. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_19.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a770043425c00d801dcef967f0e1d489bc898a2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_19.txt @@ -0,0 +1 @@ +That is TP size <= GPUs per node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_190.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_190.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc6b473c0132523104d9c03e91961d7c43ed72f5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_190.txt @@ -0,0 +1 @@ +If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_191.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_191.txt new file mode 100644 index 0000000000000000000000000000000000000000..8f9d93d5fd098f96bffc2bb4da541b28f7f82697 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_191.txt @@ -0,0 +1,3 @@ +Parameter + +It is similar with tensor model parallelism or naive layer-wise model parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_192.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_192.txt new file mode 100644 index 0000000000000000000000000000000000000000..9cae43648cf09e05b7ab8b22e169a3ba078c7fba --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_192.txt @@ -0,0 +1,3 @@ +The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) +fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which +parallelisation to use where. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_193.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_193.txt new file mode 100644 index 0000000000000000000000000000000000000000..ebaaa7d1a5b9e407883479169967cbcd91cc5b6b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_193.txt @@ -0,0 +1,2 @@ +One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and +fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_194.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_194.txt new file mode 100644 index 0000000000000000000000000000000000000000..060b2cc49d3ee81da0519d4ad938e14a866bec58 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_194.txt @@ -0,0 +1,2 @@ +So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best +strategy to utilise this specific environment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_195.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_195.txt new file mode 100644 index 0000000000000000000000000000000000000000..a213662588881b4cc2891eabd98cb2af500648b0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_195.txt @@ -0,0 +1,2 @@ +If you add/remove/replace any parts it'll run and re-optimize the plan +for that. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_196.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_196.txt new file mode 100644 index 0000000000000000000000000000000000000000..0af14339c07c35aac4d79e482e1fa143e3d35285 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_196.txt @@ -0,0 +1 @@ +And then you can train. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_197.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_197.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3f10690036cd1297535915138ac9b84db702a32 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_197.txt @@ -0,0 +1 @@ +A different setup will have its own custom optimization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_198.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_198.txt new file mode 100644 index 0000000000000000000000000000000000000000..db4bbc2cc1690c752f6044197668ec9127def098 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_198.txt @@ -0,0 +1,2 @@ +🤗 Transformers status: Transformers models are FX-trace-able via transformers.utils.fx, +which is a prerequisite for FlexFlow, however, changes are required on the FlexFlow side to make it work with Transformers models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_199.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_199.txt new file mode 100644 index 0000000000000000000000000000000000000000..e546c8f424202b1f79fee88e92b5d8f72a77a4e5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_199.txt @@ -0,0 +1,2 @@ +GPU selection +When training on multiple GPUs, you can specify the number of GPUs to use and in what order. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_2.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbf3984947cf0454d13c919ade4ab3b33a22ab6f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_2.txt @@ -0,0 +1,2 @@ +Once you have employed those strategies and found them insufficient for your +case on a single GPU, consider moving to multiple GPUs. 
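To illustrate the Sample and Attribute splits from the FlexFlow examples above, here is a small sketch using plain PyTorch tensor ops (not FlexFlow itself): the same 10 x 512 input is split sample-wise into two 5 x 512 shards, or length-wise into two 10 x 256 shards.

```python
import torch

# 10 batches of sequence length 512 (hidden dimension omitted for simplicity)
x = torch.randn(10, 512)

# Sample split across 2 devices: 10 x 512 -> 2 shards of shape (5, 512)
sample_shards = torch.chunk(x, 2, dim=0)

# Attribute (length-wise) split across 2 devices: 10 x 512 -> 2 shards of shape (10, 256)
attribute_shards = torch.chunk(x, 2, dim=1)

print([s.shape for s in sample_shards])     # [(5, 512), (5, 512)]
print([s.shape for s in attribute_shards])  # [(10, 256), (10, 256)]
```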
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_20.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef38fb62b19cac37c6411db229fb0d30c7c980dd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_20.txt @@ -0,0 +1,3 @@ +Case 3: Largest layer of your model does not fit onto a single GPU + +If you are not using ZeRO, you have to use TensorParallel (TP), because PipelineParallel (PP) alone won't be sufficient to accommodate the large layer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_200.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_200.txt new file mode 100644 index 0000000000000000000000000000000000000000..01c0e6977e1bb40efba9827ab9328d54a70a749b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_200.txt @@ -0,0 +1 @@ +This can be useful for instance when you have GPUs with different computing power and want to use the faster GPU first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_201.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_201.txt new file mode 100644 index 0000000000000000000000000000000000000000..b44ef19255ce3b459c01e0f9be1d0c20f29e0ef3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_201.txt @@ -0,0 +1 @@ +The selection process works for both DistributedDataParallel and DataParallel to use only a subset of the available GPUs, and you don't need Accelerate or the DeepSpeed integration. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_202.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_202.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dfe331fa47c8f211c4a2812f8dc087d5ebf3331 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_202.txt @@ -0,0 +1,4 @@ +Number of GPUs +For example, if you have 4 GPUs and you only want to use the first 2: + +Use the --nproc_per_node to select how many GPUs to use. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_203.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_203.txt new file mode 100644 index 0000000000000000000000000000000000000000..8621c21cc8f2fce87181b2108c4fb58c68132c81 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_203.txt @@ -0,0 +1,3 @@ +torchrun --nproc_per_node=2 trainer-program.py + +Use --num_processes to select how many GPUs to use. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_204.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_204.txt new file mode 100644 index 0000000000000000000000000000000000000000..68c8ee3ab792c5cd487ea4733abd24aa4be9747b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_204.txt @@ -0,0 +1,3 @@ +accelerate launch --num_processes 2 trainer-program.py + +Use --num_gpus to select how many GPUs to use. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_205.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_205.txt new file mode 100644 index 0000000000000000000000000000000000000000..07fd34146385abecf1faab2fe0047c7780d05d1c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_205.txt @@ -0,0 +1,4 @@ +deepspeed --num_gpus 2 trainer-program.py + +Order of GPUs +Now, to select which GPUs to use and their order, you'll use the CUDA_VISIBLE_DEVICES environment variable. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_206.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_206.txt new file mode 100644 index 0000000000000000000000000000000000000000..a873c4e2dcbfd84ce33abbf939a1203659312100 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_206.txt @@ -0,0 +1 @@ +It is easiest to set the environment variable in a ~/bashrc or another startup config file. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_207.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_207.txt new file mode 100644 index 0000000000000000000000000000000000000000..d82158748a34d4db7a2e17ca50832ffad4463f4d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_207.txt @@ -0,0 +1 @@ +CUDA_VISIBLE_DEVICES is used to map which GPUs are used. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_208.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_208.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d3d27269994f62a0ad7dc3cd94246101663d527 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_208.txt @@ -0,0 +1,4 @@ +For example, if you have 4 GPUs (0, 1, 2, 3) and you only want to run GPUs 0 and 2: + +CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py +Only the 2 physical GPUs (0 and 2) are "visible" to PyTorch and these are mapped to cuda:0 and cuda:1 respectively. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_209.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_209.txt new file mode 100644 index 0000000000000000000000000000000000000000..36b2a49e266f3f6bc300f866e45bac5a158fea83 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_209.txt @@ -0,0 +1 @@ +You can also reverse the order of the GPUs to use 2 first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_21.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..58b42e88cf856acf4508ae68c2584a613c061f67 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_21.txt @@ -0,0 +1 @@ +If you are using ZeRO, additionally adopt techniques from the Methods and tools for efficient training on a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_210.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_210.txt new file mode 100644 index 0000000000000000000000000000000000000000..086de8d5f4a2e511c2ac6e62c7fba45beee1a824 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_210.txt @@ -0,0 +1 @@ +Now, the mapping is cuda:1 for GPU 0 and cuda:0 for GPU 2. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_211.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_211.txt new file mode 100644 index 0000000000000000000000000000000000000000..d6eec6bacb07ab3c6c2c022531279a983c7940d5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_211.txt @@ -0,0 +1,2 @@ +CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py +You can also set the CUDA_VISIBLE_DEVICES environment variable to an empty value to create an environment without GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_212.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_212.txt new file mode 100644 index 0000000000000000000000000000000000000000..5510f610ea767e99b61ca15f52a08afb66a341ed --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_212.txt @@ -0,0 +1,3 @@ +CUDA_VISIBLE_DEVICES= python trainer-program.py + +As with any environment variable, they can be exported instead of being added to the command line. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_213.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_213.txt new file mode 100644 index 0000000000000000000000000000000000000000..8dd920fc759777147b724897ac048c005ef410f0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_213.txt @@ -0,0 +1 @@ +However, this is not recommended because it can be confusing if you forget how the environment variable was setup and you end up using the wrong GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_214.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_214.txt new file mode 100644 index 0000000000000000000000000000000000000000..83f6e07abed80bbc23d63ae3037189b247fd732e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_214.txt @@ -0,0 +1 @@ +Instead, it is common practice to set the environment variable for a specific training run on the same command line. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_215.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_215.txt new file mode 100644 index 0000000000000000000000000000000000000000..000745f5f4e14769c9b7c735bb13fff73a6ac8ba --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_215.txt @@ -0,0 +1 @@ +CUDA_DEVICE_ORDER is an alternative environment variable you can use to control how the GPUs are ordered. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_216.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_216.txt new file mode 100644 index 0000000000000000000000000000000000000000..d5ef8df5c9787340dcd953344a3ada63f9cedd74 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_216.txt @@ -0,0 +1,10 @@ +You can either order them by: + +PCIe bus ID's that matches the order of nvidia-smi and rocm-smi for NVIDIA and AMD GPUs respectively + +export CUDA_DEVICE_ORDER=PCI_BUS_ID + +GPU compute ability + +export CUDA_DEVICE_ORDER=FASTEST_FIRST +The CUDA_DEVICE_ORDER is especially useful if your training setup consists of an older and newer GPU, where the older GPU appears first, but you cannot physically swap the cards to make the newer GPU appear first. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_217.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_217.txt new file mode 100644 index 0000000000000000000000000000000000000000..1b68a4b83eb869f2e7c47f3fea4b136e8bf32671 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_217.txt @@ -0,0 +1 @@ +In this case, set CUDA_DEVICE_ORDER=FASTEST_FIRST to always use the newer and faster GPU first (nvidia-smi or rocm-smi still reports the GPUs in their PCIe order). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_218.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_218.txt new file mode 100644 index 0000000000000000000000000000000000000000..85ae09dc209f279604b6cb3e95b366ce510a90d3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_218.txt @@ -0,0 +1 @@ +Or you could also set export CUDA_VISIBLE_DEVICES=1,0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_22.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..eedcba7d1c32667e56a4efefb39b5610da00b984 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_22.txt @@ -0,0 +1,10 @@ +Parallelization strategy for a multi-Node / multi-GPU setup + +When you have fast inter-node connectivity (e.g., NVLINK or NVSwitch) consider using one of these options: + +ZeRO - as it requires close to no modifications to the model +A combination of PipelineParallel(PP) with TensorParallel(TP) and DataParallel(DP) - this approach will result in fewer communications, but requires significant changes to the model + +When you have slow inter-node connectivity and still low on GPU memory: + +Employ a combination of DataParallel(DP) with PipelineParallel(PP), TensorParallel(TP), and ZeRO. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_23.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..4b27937f7edb628a81f970418a3ae8f6a6f471e0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_23.txt @@ -0,0 +1 @@ +In the following sections of this guide we dig deeper into how these different parallelism methods work. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_24.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..bb2607b86d69e111c5038e2c0fc8e16e0b9a8e2a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_24.txt @@ -0,0 +1,3 @@ +Data Parallelism +Even with only 2 GPUs, you can readily leverage the accelerated training capabilities offered by PyTorch's built-in features, +such as DataParallel (DP) and DistributedDataParallel (DDP). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_25.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecd40cab5d6ee0654339dc86971e3072abdba6b9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_25.txt @@ -0,0 +1,3 @@ +Note that +PyTorch documentation recommends to prefer +DistributedDataParallel (DDP) over DataParallel (DP) for multi-GPU training as it works for all models. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_26.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb970dec28bae6ed41b0d86c1e1764b2579cffe4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_26.txt @@ -0,0 +1 @@ +Let's take a look at how these two methods work and what makes them different. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_27.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ac28083c08a8b65ff43046a942b87acfc2e0711 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_27.txt @@ -0,0 +1,7 @@ +DataParallel vs DistributedDataParallel +To understand the key differences in inter-GPU communication overhead between the two methods, let's review the processes per batch: +DDP: + +At the start time the main process replicates the model once from GPU 0 to the rest of GPUs +Then for each batch: +Each GPU directly consumes its mini-batch of data. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_28.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..62b1f01983ca908e5192655ecb3a50f7f5d3088a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_28.txt @@ -0,0 +1 @@ +During backward, once the local gradients are ready, they are averaged across all processes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_29.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..55a8093f94545d306132e22b99a6a417f381a05b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_29.txt @@ -0,0 +1,3 @@ +DP: +For each batch: + 1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_3.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..81d63300c3303d0d803bc0b2d448c8779055d90f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_3.txt @@ -0,0 +1,2 @@ +Transitioning from a single GPU to multiple GPUs requires the introduction of some form of parallelism, as the workload +must be distributed across the resources. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_30.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5f4477c2f425beb3daabfcb08b037d4f566b567 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_30.txt @@ -0,0 +1 @@ +GPU 0 reads the batch of data and then sends a mini-batch to each GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_31.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_31.txt @@ -0,0 +1 @@ +2. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_32.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..3ac09160579f6d2ca97dd31a439ea3582e65c786 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_32.txt @@ -0,0 +1 @@ +The up-to-date model is replicated from GPU 0 to each GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_33.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..5224d123cf5192a2047485943dc8dc44bd3ed1e0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_33.txt @@ -0,0 +1 @@ +3. forward is executed, and output from each GPU is sent to GPU 0 to compute the loss. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_34.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..dcd249e29f9fefd5ef6445828f1394f228bd97f1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_34.txt @@ -0,0 +1 @@ +4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_35.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..c69d3fbf2fcbee1c78279351d5348a3f0e8f36e0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_35.txt @@ -0,0 +1 @@ +The loss is distributed from GPU 0 to all GPUs, and backward is run. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_36.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..91dcb6a87071975adc555db222107a0056de804e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_36.txt @@ -0,0 +1 @@ +5. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_37.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..e65c697f63536d4be8f62e9c70d48822d461c5b6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_37.txt @@ -0,0 +1 @@ +Gradients from each GPU are sent to GPU 0 and averaged. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_38.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..6ad0b8cd274057e26b60970aed80a3a04c831dd7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_38.txt @@ -0,0 +1,2 @@ +Key differences include: +1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_39.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..164297cc24bbc7ee7d80292479a1d7f02bb89a27 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_39.txt @@ -0,0 +1 @@ +DDP performs only a single communication per batch - sending gradients, while DP performs five different data exchanges per batch. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_4.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..86af1045707519d3984317ef3f3980438d9e04cc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_4.txt @@ -0,0 +1,2 @@ +Multiple techniques can be employed to achieve parallelism, such as data +parallelism, tensor parallelism, and pipeline parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_40.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..3d87bba572a73c7cf949696bdaf4439d4a754129 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_40.txt @@ -0,0 +1,2 @@ +DDP copies data using torch.distributed, while DP copies data within +the process via Python threads (which introduces limitations associated with GIL). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_41.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..10cf0050d0de4e5f34778466ec39f392dce72c2c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_41.txt @@ -0,0 +1 @@ +As a result, DistributedDataParallel (DDP) is generally faster than DataParallel (DP) unless you have slow GPU card inter-connectivity. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_42.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5aa09bb183641fff6fbe16dcf2c4c3aaa816f76 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_42.txt @@ -0,0 +1 @@ +2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_43.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..064474dd2a21cfcd7c7256fa5ba95947478d7a9a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_43.txt @@ -0,0 +1 @@ +Under DP, GPU 0 performs significantly more work than other GPUs, resulting in GPU under-utilization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_44.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..1865329170cf7f963a5d2a4f2937b8973a908787 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_44.txt @@ -0,0 +1 @@ +3. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_45.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b76321090ee9ab4bc371b4d50321814e87d3610 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_45.txt @@ -0,0 +1 @@ +DDP supports distributed training across multiple machines, whereas DP does not. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_46.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5425d702e7eecb1150d5edd27595ea21ad94d80 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_46.txt @@ -0,0 +1 @@ +This is not an exhaustive list of differences between DP and DDP, however, other nuances are out of scope of this guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_47.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..2cc46f2db36f145e59b9e6cd3496877527880040 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_47.txt @@ -0,0 +1 @@ +You can get a deeper understanding of these methods by reading this article. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_48.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..56c6c7d2aad15a1ea87f02054c8734a6d6236d03 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_48.txt @@ -0,0 +1 @@ +Let's illustrate the differences between DP and DDP with an experiment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_49.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef0f36cbcb7140183e1bf9eac6a07ffc76543362 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_49.txt @@ -0,0 +1,4 @@ +We'll benchmark the differences between DP and +DDP with an added context of NVLink presence: + +Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (NV2 in nvidia-smi topo -m). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_5.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..786a3b9fb56ad12974fb2ca9137189d91d6d892c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_5.txt @@ -0,0 +1,2 @@ +It's important to note that there isn't a one-size-fits-all +solution, and the optimal settings depend on the specific hardware configuration you are using. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_50.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..ec332d6ef2e90dd550a9cfb0e34e806149eb20bb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_50.txt @@ -0,0 +1 @@ +Software: pytorch-1.8-to-be + cuda-11.0 / transformers==4.3.0.dev0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_51.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..2645c63b8d7a602f7d317402021d4a4edbc3c2f2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_51.txt @@ -0,0 +1 @@ +To disable the NVLink feature on one of the benchmarks, we use NCCL_P2P_DISABLE=1. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_52.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..ce55f9ebafae5f9d74e492782df828096d60a562 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_52.txt @@ -0,0 +1,32 @@ +Here is the benchmarking code and outputs: +DP +```bash +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +python examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 +{'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69} + +DDP w/ NVlink +```bash +rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \ +torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 +{'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69} + +DDP w/o NVlink +```bash +rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \ +torchrun --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \ +--model_name_or_path openai-community/gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \ +--do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200 +{'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69} + +Here are the same benchmarking results gathered in a table for convenience: +| Type | NVlink | Time | +| :----- | ----- | ---: | +| 2:DP | Y | 110s | +| 2:DDP | Y | 101s | +| 2:DDP | N | 131s | +As you can see, in this case DP is ~10% slower than DDP with NVlink, but ~15% faster than DDP without NVlink. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_53.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..d655e691d7f319fba4f2f605f659eef9570ac0d9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_53.txt @@ -0,0 +1,2 @@ +The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, +the more a slow link will impede the overall runtime. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_54.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..a226d08f3f5575fdff79e2f55f74011628942d6e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_54.txt @@ -0,0 +1,2 @@ +ZeRO Data Parallelism +ZeRO-powered data parallelism (ZeRO-DP) is illustrated in the following diagram from this blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_55.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..1393fdfd98d767c24e531ec8e1c18776e59dda7a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_55.txt @@ -0,0 +1 @@ +While it may appear complex, it is a very similar concept to DataParallel (DP). 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_56.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..ac3b0176ef85e5692e51541a7d0b121586b0dd60 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_56.txt @@ -0,0 +1,2 @@ +The difference is that instead of +replicating the full model parameters, gradients and optimizer states, each GPU stores only a slice of it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_57.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..2c2c597b782efef0e82bd1697dcc84fbd6729fe7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_57.txt @@ -0,0 +1,3 @@ +Then, at +run-time when the full layer parameters are needed just for the given layer, all GPUs synchronize to give each other +parts that they miss. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_58.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9c82c4cfd6b7fb7e9316d65af36710bd57f3828 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_58.txt @@ -0,0 +1 @@ +To illustrate this idea, consider a simple model with 3 layers (La, Lb, and Lc), where each layer has 3 parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_59.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc78f23f3ea44df67dc16e6f4de50a485077d764 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_59.txt @@ -0,0 +1,23 @@ +Layer La, for example, has weights a0, a1 and a2: +La | Lb | Lc +---|----|--- +a0 | b0 | c0 +a1 | b1 | c1 +a2 | b2 | c2 +If we have 3 GPUs, ZeRO-DP splits the model onto 3 GPUs like so: + +GPU0: +La | Lb | Lc +---|----|--- +a0 | b0 | c0 +GPU1: +La | Lb | Lc +---|----|--- +a1 | b1 | c1 +GPU2: +La | Lb | Lc +---|----|--- +a2 | b2 | c2 + +In a way, this is the same horizontal slicing as tensor parallelism, as opposed to Vertical +slicing, where one puts whole layer-groups on different GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_6.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b8b3224c6d0bd2de6745ddcff37ddd3c879d850 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_6.txt @@ -0,0 +1,2 @@ +This guide offers an in-depth overview of individual types of parallelism, as well as guidance on ways to combine +techniques and choosing an appropriate approach. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_60.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..140d4d58f117594642538b8f152bc4424a8093bd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_60.txt @@ -0,0 +1,6 @@ +Now let's see how this works: +Each of these GPUs will get the usual mini-batch as it works in DP: +x0 => GPU0 +x1 => GPU1 +x2 => GPU2 +The inputs are passed without modifications as if they would be processed by the original model. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_61.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..803477c8fb7b1679ce3997a114eeceb93348bfa8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_61.txt @@ -0,0 +1 @@ +First, the inputs get to the layer La. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_62.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..aba22e1e15b8c5a0b0ab7d71d9aae0ccd853ca05 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_62.txt @@ -0,0 +1 @@ +What happens at this point? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_63.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5f6c0983746c3d4e6fbee2a2170c0854bc7fb78 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_63.txt @@ -0,0 +1 @@ +On GPU0: the x0 mini-batch requires the a0, a1, a2 parameters to do its forward path through the layer, but the GPU0 has only a0. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_64.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..6db028a6da99c2cd906e74df7b790290ec8584f6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_64.txt @@ -0,0 +1 @@ +It will get a1 from GPU1 and a2 from GPU2, bringing all the pieces of the model together. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_65.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..539d3562361ef65d6c27f46e34c01d78b84d835e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_65.txt @@ -0,0 +1 @@ +In parallel, GPU1 gets another mini-batch - x1. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_66.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..8cdf4c2c1bcb3de644230dbf8bc5d199b5a44509 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_66.txt @@ -0,0 +1 @@ +GPU1 has the a1 parameter, but needs a0 and a2, so it gets those from GPU0 and GPU2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_67.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..0389b9f37bd1c3948f6ca684d8539941201d880b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_67.txt @@ -0,0 +1 @@ +Same happens to GPU2 that gets the mini-batch x2. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_68.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..2e8f4f8ff32b8085d19d29a74225d1cc369d26c9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_68.txt @@ -0,0 +1 @@ +It gets a0 and a1 from GPU0 and GPU1. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_69.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..833c72154b7dc8dad2d63ca75055de3b6011824b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_69.txt @@ -0,0 +1 @@ +This way each of the 3 GPUs gets the full tensors reconstructed and makes a forward pass with its own mini-batch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_7.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fee70544622cba8099b61eb6fd36c16d3568a3e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_7.txt @@ -0,0 +1,2 @@ +For step-by-step tutorials on distributed training, please refer to +the 🤗 Accelerate documentation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_70.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..a3b950b81f0b5d6cb54e4b5d08aec78c9c86933c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_70.txt @@ -0,0 +1 @@ +As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_71.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..0d99aea92a1aa240e2a9e57634ff891ab1554e9f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_71.txt @@ -0,0 +1 @@ +The reconstruction is done efficiently via a pre-fetch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_72.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..7dbc0e90d162e2b71b3ff93325b74135b0e55611 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_72.txt @@ -0,0 +1 @@ +Then the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_73.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..019d317a7ffc7208e15d0389e6a9d1dadbc2d131 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_73.txt @@ -0,0 +1,2 @@ +This mechanism is similar to an efficient group backpacking strategy: person A carries the tent, person B carries the stove, +and person C carries the axe. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_74.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecfd820e791e2e8ab8e5b5c32132b6f95f3e882c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_74.txt @@ -0,0 +1,2 @@ +Each night they all share what they have with others and get from others what they don't have, +and in the morning they pack up their allocated type of gear and continue on their way. 
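To make the gather-compute-drop cycle above concrete, here is a toy, framework-free sketch. The three "GPUs" are plain Python dicts and the "layer" is a dot product; nothing below comes from DeepSpeed's actual implementation, it only mirrors the bookkeeping: each rank permanently owns one slice of La, temporarily gathers the missing slices to run the forward pass on its own mini-batch, and then drops the borrowed slices again.
python
# Toy simulation of how ZeRO-DP handles layer La (a0, a1, a2) across 3 ranks.
# A real implementation uses collective ops (all-gather) on real devices.
owned_shard = {0: {"a0": 0.1}, 1: {"a1": 0.2}, 2: {"a2": 0.3}}   # resident at all times
mini_batches = {0: [1.0, 1.0, 1.0], 1: [2.0, 2.0, 2.0], 2: [3.0, 3.0, 3.0]}

def gather_layer_la():
    """Reassemble the full layer from the slices owned by every rank."""
    full = {}
    for shard in owned_shard.values():
        full.update(shard)                     # stands in for an all-gather
    return full

for rank in (0, 1, 2):
    params = gather_layer_la()                 # temporarily materialize all of La
    weights = [params["a0"], params["a1"], params["a2"]]
    x = mini_batches[rank]
    activation = sum(w * xi for w, xi in zip(weights, x))
    print(f"rank {rank}: forward through La -> {activation:.2f}")
    del params                                 # drop the borrowed slices; only the
                                               # owned shard stays in memory
The same pattern then repeats for Lb and Lc in the forward pass, and again in reverse order for the backward pass.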
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_75.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..0ef711f8e1e2bc0d389de73b6cf640744c19bfb7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_75.txt @@ -0,0 +1 @@ +This is what ZeRO DP/Sharded DDP is. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_76.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..442c1b5d426ded0bb74cfbcaa9232b9ac1212c55 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_76.txt @@ -0,0 +1,2 @@ +Compare this strategy to the simple one where each person has to carry their own tent, stove and axe (similar to +DataParallel (DP and DDP) in PyTorch), which would be far more inefficient. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_77.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..272066f7062362fe30af3fe57a97ef7e0a3a7368 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_77.txt @@ -0,0 +1 @@ +While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_78.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd0aa3ea3bac6da326840baa67674492d61104e3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_78.txt @@ -0,0 +1,2 @@ +If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism +which will be discussed later. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_79.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c0523e76bd8c53859e88b6259eb42d83d6dc99be --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_79.txt @@ -0,0 +1,2 @@ +This is because it partitions/shards each layer's weights, unlike vertical model parallelism +which is discussed next. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_8.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..6cf2776936eefe0f5cde68cd2191500cea547f67 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_8.txt @@ -0,0 +1,2 @@ +While the main concepts discussed in this guide are likely applicable across frameworks, here we focus on +PyTorch-based implementations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_80.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..7060ffd30fc453a5f03833b5c6e894e1f155b0a3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_80.txt @@ -0,0 +1,8 @@ +Implementations: + +DeepSpeed ZeRO-DP stages 1+2+3 +Accelerate integration +transformers integration + +From Naive Model Parallelism to Pipeline Parallelism +To explain Pipeline parallelism, we'll first look into Naive Model Parallelism (MP), also known as Vertical MP. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_81.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..977bfceac8ff46b7ac93e6d61ae085b0849421d2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_81.txt @@ -0,0 +1,2 @@ +This approach +involves distributing groups of model layers across multiple GPUs by assigning specific layers to specific GPUs with .to(). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_82.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..7a90908814d035216b35e098444b01dc726ef489 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_82.txt @@ -0,0 +1 @@ +As data flows through these layers, it is moved to the same GPU as the layer, while the other layers remain untouched. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_83.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..93913bea719e52f7b2afed8241688717bc021c55 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_83.txt @@ -0,0 +1 @@ +We refer to this Model parallelism as "Vertical" because of how models are typically visualized. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_84.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..622aab1511f1c6e69f73903662f9eda9648b46d3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_84.txt @@ -0,0 +1,18 @@ +For example, the +following diagram shows an 8-layer model split vertically into two slices, placing layers 0-3 onto +GPU0 and 4-7 to GPU1: + +| Layer | | +| 0 | | +| 1 | GPU0 | +| 2 | | +| 3 | | +================ +| Layer | | +| 4 | | +| 5 | GPU1 | +| 6 | | +| 7 | | +================ + +In this example, when data moves from layer 0 to 3, it's no different from regular forward pass. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_85.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f3110610fa6d0064433c4059197dbc13005e0d1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_85.txt @@ -0,0 +1,2 @@ +However, passing data +from layer 3 to 4 requires moving it from GPU0 to GPU1, introducing a communication overhead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_86.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b338ea7355601c4d3f628bcc619efa0aa31ee6c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_86.txt @@ -0,0 +1,2 @@ +If the participating +GPUs are on the same compute node (e.g. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_87.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..efaff7860efa64accb18bd6782a8341d68b36c55 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_87.txt @@ -0,0 +1,2 @@ +same physical machine) this copying is fast, but if the GPUs are distributed +across different compute nodes (e.g. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_88.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c0edcfe6bf76e6bd2fc045e8f4bac1d38116461 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_88.txt @@ -0,0 +1 @@ +multiple machines), the communication overhead could be substantially greater. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_89.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2fb2afd7accc56ae1a077be7496b670fd885d92 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_89.txt @@ -0,0 +1 @@ +Following that, layers 4 to 7 work as they would in the original model. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_9.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b4c2de65cdaad34ab10536f15c5ee06effcd2b0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_9.txt @@ -0,0 +1,2 @@ +Before diving deeper into the specifics of each technique, let's go over the rough decision process when training +large models on a large infrastructure. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_90.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..01bbbb075da09126b3a0bdbb4bb34f6bdc32d3ae --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_90.txt @@ -0,0 +1,2 @@ +Upon completion of the 7th layer, there is often +a need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_91.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..1897dd5b8ea6edacfcfa6691682658ccaae2dd43 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_91.txt @@ -0,0 +1,2 @@ +Now the loss can be +computed and the optimizer can do its work. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_92.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..d579be6ebe100c71a91853c1544d96a795f18619 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_92.txt @@ -0,0 +1,2 @@ +Naive Model Parallelism comes with several shortcomings: +- All but one GPU are idle at any given moment: if 4 GPUs are used, it's nearly identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_93.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..a04083f5df58efb9e6549676d13d9d607bff4cb9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_93.txt @@ -0,0 +1 @@ +- Overhead in data transfer between devices: E.g.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_94.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..3156a31873235dbba64b03c99e2864fbd5e26967 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_94.txt @@ -0,0 +1 @@ +4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, but a single 24GB card will complete the training faster, because it doesn't have the data copying overhead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_95.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..dfad266409385a534b8d9389b35d45067575374c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_95.txt @@ -0,0 +1,2 @@ +But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states) +- Copying shared embeddings: Shared embeddings may need to get copied back and forth between GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_96.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..4d37cb004f9192b3f55c349f9a24bfade36ac621 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_96.txt @@ -0,0 +1 @@ +Now that you are familiar with how the naive approach to model parallelism works and its shortcomings, let's look at Pipeline Parallelism (PP). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_97.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bc1e56e782533271807afb15fdd3b4af3d70c04 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_97.txt @@ -0,0 +1,2 @@ +PP is almost identical to a naive MP, but it solves the GPU idling problem by chunking the incoming batch into micro-batches +and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_98.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..f81eb5e19e936a673b0d9ae18e7a451d0d5cdd4f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_98.txt @@ -0,0 +1,5 @@ +The following illustration from the GPipe paper +shows the naive MP on the top, and PP on the bottom: + +At the bottom of the diagram, you can observe that the Pipeline Parallelism (PP) approach minimizes the number of idle +GPU zones, referred to as 'bubbles'. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_many/chunk_99.txt b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..9d1846f1274592942a1c22f353837bce2e4d5682 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_many/chunk_99.txt @@ -0,0 +1,2 @@ +Both parts of the diagram show a parallelism level of degree 4, meaning that 4 GPUs +are involved in the pipeline. 
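The sketch below ties the two ideas together. It assumes a machine with at least two CUDA devices and uses a toy 8-layer stack of Linear layers; it is illustrative only and does not implement the GPipe schedule (there is no interleaving of forward and backward work), it merely shows the .to() placement from naive MP plus the splitting of a batch into micro-batches that PP builds on.
python
import torch
import torch.nn as nn

# Assumes at least two CUDA devices ("cuda:0" and "cuda:1") are available.
layers = [nn.Linear(128, 128) for _ in range(8)]
stage0 = nn.Sequential(*layers[:4]).to("cuda:0")   # layers 0-3 on GPU0
stage1 = nn.Sequential(*layers[4:]).to("cuda:1")   # layers 4-7 on GPU1

batch = torch.randn(32, 128)

# Naive MP: the whole batch crosses the GPU boundary in one go;
# GPU1 idles while GPU0 computes, and vice versa.
out_naive = stage1(stage0(batch.to("cuda:0")).to("cuda:1"))

# Pipeline flavor: chunk the batch into micro-batches. With a real schedule,
# stage1 can work on micro-batch 0 while stage0 already processes micro-batch 1.
outputs = []
for micro_batch in batch.chunk(4):
    hidden = stage0(micro_batch.to("cuda:0"))
    outputs.append(stage1(hidden.to("cuda:1")))
out_pipelined = torch.cat(outputs)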
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_0.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..7771441d03152adbefeb609212ffeec02245858e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_0.txt @@ -0,0 +1,4 @@ + +Methods and tools for efficient training on a single GPU +This guide demonstrates practical techniques that you can use to increase the efficiency of your model's training by +optimizing memory utilization, speeding up the training, or both. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_1.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..048a9205a02dad9a4ca76bf76574c56ffe8c3571 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_1.txt @@ -0,0 +1,2 @@ +If you'd like to understand how GPU is utilized during +training, please refer to the Model training anatomy conceptual guide first. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_10.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..72e7ac7027d34f35d2e003b405a318ea5c97e508 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_10.txt @@ -0,0 +1,15 @@ +The methods and tools covered in this guide can be classified based on the effect they have on the training process: +| Method/tool | Improves training speed | Optimizes memory utilization | +|:-----------------------------------------------------------|:------------------------|:-----------------------------| +| Batch size choice | Yes | Yes | +| Gradient accumulation | No | Yes | +| Gradient checkpointing | No | Yes | +| Mixed precision training | Yes | (No) | +| Optimizer choice | Yes | Yes | +| Data preloading | Yes | No | +| DeepSpeed Zero | No | Yes | +| torch.compile | Yes | No | +| Parameter-Efficient Fine Tuning (PEFT) | No | Yes | + +Note: when using mixed precision with a small model and a large batch size, there will be some memory savings but with a +large model and a small batch size, the memory use will be larger. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_100.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_100.txt new file mode 100644 index 0000000000000000000000000000000000000000..68388f339816a3a9a0b99a9b4184f090d62d5828 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_100.txt @@ -0,0 +1 @@ +It should eventually become the default, but if you want to experiment with it sooner, take a look at this GitHub issue. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_101.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_101.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b7ef749af14d251e064e30e0cf80e97259c165 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_101.txt @@ -0,0 +1,3 @@ +Data preloading +One of the important requirements to reach great training speed is the ability to feed the GPU at the maximum speed it +can handle. 
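The next few paragraphs explain the two DataLoader settings involved. As a reference point, here is a minimal sketch of where that configuration ends up, with a synthetic TensorDataset standing in for a real tokenized training set.
python
import torch
from torch.utils.data import DataLoader, TensorDataset

# Synthetic stand-in for a real (tokenized) training dataset.
train_dataset = TensorDataset(torch.randn(1024, 16), torch.randint(0, 2, (1024,)))

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    pin_memory=True,   # page-locked CPU memory -> faster CPU-to-GPU copies
    num_workers=4,     # background worker processes preload upcoming batches
)
When training with [Trainer], the same knobs are exposed as the [TrainingArguments] fields dataloader_pin_memory and dataloader_num_workers, as noted below.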
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_102.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_102.txt new file mode 100644 index 0000000000000000000000000000000000000000..601bc1b57240f12747270cc8bf8a5f5fbaf0f4bd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_102.txt @@ -0,0 +1,2 @@ +By default, everything happens in the main process, and it might not be able to read the data from disk fast +enough, and thus create a bottleneck, leading to GPU under-utilization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_103.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_103.txt new file mode 100644 index 0000000000000000000000000000000000000000..8e1bdd043853ac62e19514bc064874bb68c5cb5c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_103.txt @@ -0,0 +1,3 @@ +Configure the following arguments to reduce the bottleneck: + +DataLoader(pin_memory=True, ) - ensures the data gets preloaded into the pinned memory on CPU and typically leads to much faster transfers from CPU to GPU memory. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_104.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_104.txt new file mode 100644 index 0000000000000000000000000000000000000000..4bc7f399492ff8cdc21186b14e87908d363507fc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_104.txt @@ -0,0 +1 @@ +DataLoader(num_workers=4, ) - spawn several workers to preload data faster. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_105.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_105.txt new file mode 100644 index 0000000000000000000000000000000000000000..59e16fa53ad5f99a0342518deea80db069fbb304 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_105.txt @@ -0,0 +1 @@ +During training, watch the GPU utilization stats; if it's far from 100%, experiment with increasing the number of workers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_106.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_106.txt new file mode 100644 index 0000000000000000000000000000000000000000..f4041c2775f5397f0a8fa2c94ae482c85feefc1a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_106.txt @@ -0,0 +1 @@ +Of course, the problem could be elsewhere, so many workers won't necessarily lead to better performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_107.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_107.txt new file mode 100644 index 0000000000000000000000000000000000000000..f21c37a226b34eb52057b9ed01ba37eee116efbc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_107.txt @@ -0,0 +1 @@ +When using [Trainer], the corresponding [TrainingArguments] are: dataloader_pin_memory (True by default), and dataloader_num_workers (defaults to 0). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_108.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_108.txt new file mode 100644 index 0000000000000000000000000000000000000000..f159fa0dc0b2f4704eb57f2015bc20b4e357b3d3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_108.txt @@ -0,0 +1,2 @@ +DeepSpeed ZeRO +DeepSpeed is an open-source deep learning optimization library that is integrated with 🤗 Transformers and 🤗 Accelerate. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_109.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_109.txt new file mode 100644 index 0000000000000000000000000000000000000000..f195a6fa4b16bc8a7c281d2027db41773bb09891 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_109.txt @@ -0,0 +1,2 @@ +It provides a wide range of features and optimizations designed to improve the efficiency and scalability of large-scale +deep learning training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_11.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..c5e9aef745f96b4b717b1ec5a22ff0c571f487f1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_11.txt @@ -0,0 +1 @@ +You can combine the above methods to get a cumulative effect. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_110.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_110.txt new file mode 100644 index 0000000000000000000000000000000000000000..47eb4d29d4cf8d82615717692203eceb0a4cd9b4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_110.txt @@ -0,0 +1,2 @@ +If your model fits onto a single GPU and you have enough space to fit a small batch size, you don't need to use DeepSpeed +as it'll only slow things down. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_111.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_111.txt new file mode 100644 index 0000000000000000000000000000000000000000..16705c5925d815aa9d2812576221ff94d41a0a26 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_111.txt @@ -0,0 +1,2 @@ +However, if the model doesn't fit onto a single GPU or you can't fit a small batch, you can +leverage DeepSpeed ZeRO + CPU Offload, or NVMe Offload for much larger models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_112.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_112.txt new file mode 100644 index 0000000000000000000000000000000000000000..9891cbae78776f0bacbe32568c6785fcbbb0c8dd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_112.txt @@ -0,0 +1,6 @@ +In this case, you need to separately +install the library, then follow one of the guides to create a configuration file +and launch DeepSpeed: + +For an in-depth guide on DeepSpeed integration with [Trainer], review the corresponding documentation, specifically the +section for a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_113.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_113.txt new file mode 100644 index 0000000000000000000000000000000000000000..428b9af6249ca3dd4f4e816375c504e2472f6bb2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_113.txt @@ -0,0 +1 @@ +Some adjustments are required to use DeepSpeed in a notebook; please take a look at the corresponding guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_114.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_114.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a2b118230c77a4758898205f17371210ea3cc1d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_114.txt @@ -0,0 +1 @@ +If you prefer to use 🤗 Accelerate, refer to 🤗 Accelerate DeepSpeed guide. 
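As a concrete starting point, here is a minimal, illustrative ZeRO stage-2 configuration with optimizer CPU offload. The keys shown are standard DeepSpeed config fields, but the values are placeholders and the config is deliberately incomplete; consult the DeepSpeed and Transformers documentation for the full schema. The special value "auto" lets the [Trainer] integration fill in settings from the corresponding [TrainingArguments].
python
import json

# Minimal, illustrative ZeRO stage-2 + CPU-offload config (placeholder values).
ds_config = {
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": True},
    },
    "fp16": {"enabled": "auto"},
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

# Either pass the dict directly as TrainingArguments(..., deepspeed=ds_config),
# or write it to disk and point the deepspeed argument at the file:
with open("ds_config.json", "w") as f:
    json.dump(ds_config, f, indent=2)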
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_115.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_115.txt new file mode 100644 index 0000000000000000000000000000000000000000..82f78e422046df3c2020ac85d73df2f22ea37b93 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_115.txt @@ -0,0 +1,3 @@ +Using torch.compile +PyTorch 2.0 introduced a new compile function that doesn't require any modification to existing PyTorch code but can +optimize your code by adding a single line of code: model = torch.compile(model). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_116.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_116.txt new file mode 100644 index 0000000000000000000000000000000000000000..a5e68885eb31f1ab3d42b67f85e43e08b0fab301 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_116.txt @@ -0,0 +1,4 @@ +If using [Trainer], you only need to pass the torch_compile option in the [TrainingArguments]: +python +training_args = TrainingArguments(torch_compile=True, **default_args) +torch.compile uses Python's frame evaluation API to automatically create a graph from existing PyTorch programs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_117.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_117.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f8b7854546bfa38f0123714aba668083bc43593 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_117.txt @@ -0,0 +1,2 @@ +After +capturing the graph, different backends can be deployed to lower the graph to an optimized engine. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_118.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_118.txt new file mode 100644 index 0000000000000000000000000000000000000000..9eda94b608db104b4656181670f27c4f786198b7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_118.txt @@ -0,0 +1 @@ +You can find more details and benchmarks in PyTorch documentation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_119.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_119.txt new file mode 100644 index 0000000000000000000000000000000000000000..5147d45dde75f5e922473a9a371f1dabc12ae267 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_119.txt @@ -0,0 +1 @@ +torch.compile has a growing list of backends, which can be found in by calling torchdynamo.list_backends(), each of which with its optional dependencies. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_12.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..388ab0e38e0db71f68f650af0801f07846e7a990 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_12.txt @@ -0,0 +1,3 @@ +These techniques are available to you whether you are +training your model with [Trainer] or writing a pure PyTorch loop, in which case you can configure these optimizations +with 🤗 Accelerate. 
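Outside of [Trainer], the same one-liner applies to any PyTorch module. A minimal sketch (PyTorch >= 2.0), with the backend passed explicitly to mirror the torch_compile_backend argument discussed next:
python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 2))

# "inductor" is the default backend; naming it explicitly mirrors
# torch_compile_backend in TrainingArguments.
compiled_model = torch.compile(model, backend="inductor")

x = torch.randn(8, 64)
out = compiled_model(x)   # the first call triggers graph capture + compilation
out.sum().backward()      # the compiled module trains like the original one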
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_120.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_120.txt new file mode 100644 index 0000000000000000000000000000000000000000..98a8a9950c0aabff4b6e73f73ca8a2306217c903 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_120.txt @@ -0,0 +1 @@ +Choose which backend to use by specifying it via torch_compile_backend in the [TrainingArguments]. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_121.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_121.txt new file mode 100644 index 0000000000000000000000000000000000000000..5a8e6000e61dc2e1f08640c0a691c5949aa1910e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_121.txt @@ -0,0 +1,3 @@ +Some of the most commonly used backends are: +Debugging backends: +* dynamo.optimize("eager") - Uses PyTorch to run the extracted GraphModule. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_122.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_122.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab544edaadae56fe34ed437dae67bb994660af93 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_122.txt @@ -0,0 +1 @@ +This is quite useful in debugging TorchDynamo issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_123.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_123.txt new file mode 100644 index 0000000000000000000000000000000000000000..50c17e2a36b9af89f3072aef204dfa9c125c9912 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_123.txt @@ -0,0 +1 @@ +* dynamo.optimize("aot_eager") - Uses AotAutograd with no compiler, i.e, just using PyTorch eager for the AotAutograd's extracted forward and backward graphs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_124.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_124.txt new file mode 100644 index 0000000000000000000000000000000000000000..fe3773f9a0995409d18fa7e5bdc7108cdc1b0415 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_124.txt @@ -0,0 +1 @@ +This is useful for debugging, and unlikely to give speedups. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_125.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_125.txt new file mode 100644 index 0000000000000000000000000000000000000000..7097b272f81cd247fe5cb7694d34a13451295dd1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_125.txt @@ -0,0 +1,3 @@ +Training & inference backends: +* dynamo.optimize("inductor") - Uses TorchInductor backend with AotAutograd and cudagraphs by leveraging codegened Triton kernels Read more +* dynamo.optimize("nvfuser") - nvFuser with TorchScript. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_126.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_126.txt new file mode 100644 index 0000000000000000000000000000000000000000..485996eeb46f8cffc64b88d9aab24ebade78a0e8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_126.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("aot_nvfuser") - nvFuser with AotAutograd. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_127.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_127.txt new file mode 100644 index 0000000000000000000000000000000000000000..1207ffa4a4991544cf5c28ffafb7c36b5884c0f8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_127.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("aot_cudagraphs") - cudagraphs with AotAutograd. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_128.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_128.txt new file mode 100644 index 0000000000000000000000000000000000000000..136888d80218406711af457df6c88c7bd0e6c1fc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_128.txt @@ -0,0 +1,3 @@ +Read more +Inference-only backends: +* dynamo.optimize("ofi") - Uses Torchscript optimize_for_inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_129.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_129.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b17697657d31c392fd5bd1a4d43984c6ed52a69 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_129.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("fx2trt") - Uses NVIDIA TensorRT for inference optimizations. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_13.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3df3c53ba32d8fad2d8200007b94933da718fec --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_13.txt @@ -0,0 +1,6 @@ +If these methods do not result in sufficient gains, you can explore the following options: +* Look into building your own custom Docker container with efficient software prebuilds +* Consider a model that uses Mixture of Experts (MoE) +* Convert your model to BetterTransformer to leverage PyTorch native attention +Finally, if all of the above is still not enough, even after switching to a server-grade GPU like A100, consider moving +to a multi-GPU setup. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_130.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_130.txt new file mode 100644 index 0000000000000000000000000000000000000000..dc2699b612c999623071e02f13c66d8f0913f934 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_130.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("onnxrt") - Uses ONNXRT for inference on CPU/GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_131.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_131.txt new file mode 100644 index 0000000000000000000000000000000000000000..0c587dd97c1a076c03f38522758cf5dae0cfe202 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_131.txt @@ -0,0 +1,2 @@ +Read more +* dynamo.optimize("ipex") - Uses IPEX for inference on CPU.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_132.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_132.txt new file mode 100644 index 0000000000000000000000000000000000000000..44a7a59fd4092cbc93a291ca53c9b241438612c7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_132.txt @@ -0,0 +1,4 @@ +Read more
For an example of using torch.compile with 🤗 Transformers, check out this blog post on fine-tuning a BERT model for Text Classification using the newest PyTorch 2.0 features
Using 🤗 PEFT
Parameter-Efficient Fine Tuning (PEFT) methods freeze the pretrained model parameters during fine-tuning and add a small number of trainable parameters (the adapters) on top of it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_133.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_133.txt new file mode 100644 index 0000000000000000000000000000000000000000..66d05d6ace0027e53674bc7e6a38905a07711bf9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_133.txt @@ -0,0 +1 @@ +As a result, the memory associated with the optimizer states and gradients is greatly reduced. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_134.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_134.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff65a1c836c3a679b50ad68119fc0620a2e10248 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_134.txt @@ -0,0 +1,5 @@ +For example with a vanilla AdamW, the memory requirement for the optimizer state would be: +* fp32 copy of parameters: 4 bytes/param +* Momentum: 4 bytes/param +* Variance: 4 bytes/param +Suppose a model with 7B parameters and 200 million parameters injected with Low Rank Adapters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_135.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_135.txt new file mode 100644 index 0000000000000000000000000000000000000000..8be3b2ac985f50f84d3d789277fc8e98c3b4ac83 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_135.txt @@ -0,0 +1 @@ +The memory requirement for the optimizer state of the plain model would be 12 * 7 = 84 GB (assuming 7B trainable parameters). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_136.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_136.txt new file mode 100644 index 0000000000000000000000000000000000000000..eeb92ee8bacd414d60c298d0a079de9ff3907d7a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_136.txt @@ -0,0 +1 @@ +Adding LoRA slightly increases the memory associated with the model weights and substantially decreases the memory requirement for the optimizer state to 12 * 0.2 = 2.4GB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_137.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_137.txt new file mode 100644 index 0000000000000000000000000000000000000000..8827cd7d53bd45418be22c4787395a97c670d05b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_137.txt @@ -0,0 +1 @@ +Read more about PEFT and its detailed usage in the PEFT documentation or PEFT repository.
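A quick back-of-the-envelope check of the numbers above (using 1 GB = 10^9 bytes and counting only the AdamW optimizer state, not the weights, gradients or activations):
python
# AdamW keeps an fp32 master copy, momentum and variance per trainable parameter.
BYTES_PER_TRAINABLE_PARAM = 4 + 4 + 4   # = 12 bytes

def optimizer_state_gb(trainable_params: float) -> float:
    return trainable_params * BYTES_PER_TRAINABLE_PARAM / 1e9

print(optimizer_state_gb(7e9))     # full fine-tuning, 7B trainable params -> 84.0
print(optimizer_state_gb(0.2e9))   # ~200M LoRA adapter params             -> 2.4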
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_138.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_138.txt new file mode 100644 index 0000000000000000000000000000000000000000..d12ea538d26afc545311d675fafc3a8838c184b0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_138.txt @@ -0,0 +1,3 @@ +Using 🤗 Accelerate +With 🤗 Accelerate you can use the above methods while gaining full +control over the training loop and can essentially write the loop in pure PyTorch with some minor modifications. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_139.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_139.txt new file mode 100644 index 0000000000000000000000000000000000000000..3a44e840ebdc3bb8689daa82d80bbf680bbd3fef --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_139.txt @@ -0,0 +1,28 @@ +Suppose you have combined the methods in the [TrainingArguments] like so: +py +training_args = TrainingArguments( + per_device_train_batch_size=1, + gradient_accumulation_steps=4, + gradient_checkpointing=True, + fp16=True, + **default_args, +) +The full example training loop with 🤗 Accelerate is only a handful of lines of code long: + +from accelerate import Accelerator +from torch.utils.data.dataloader import DataLoader +dataloader = DataLoader(ds, batch_size=training_args.per_device_train_batch_size) +if training_args.gradient_checkpointing: + model.gradient_checkpointing_enable() +accelerator = Accelerator(fp16=training_args.fp16) +model, optimizer, dataloader = accelerator.prepare(model, adam_bnb_optim, dataloader) +model.train() +for step, batch in enumerate(dataloader, start=1): + loss = model(**batch).loss + loss = loss / training_args.gradient_accumulation_steps + accelerator.backward(loss) + if step % training_args.gradient_accumulation_steps == 0: + optimizer.step() + optimizer.zero_grad() + +First we wrap the dataset in a DataLoader. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_14.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..d7106613742593764c70fd630455b9f402e5aa51 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_14.txt @@ -0,0 +1,2 @@ +All these approaches are still valid in a multi-GPU setup, plus you can leverage additional parallelism +techniques outlined in the multi-GPU section. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_140.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_140.txt new file mode 100644 index 0000000000000000000000000000000000000000..5222f55009dd53c6f75941e7041fa69f64e72c35 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_140.txt @@ -0,0 +1 @@ +Then we can enable gradient checkpointing by calling the model's [~PreTrainedModel.gradient_checkpointing_enable] method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_141.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_141.txt new file mode 100644 index 0000000000000000000000000000000000000000..79869703de0e4d0496a294364eac6c704cd4efad --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_141.txt @@ -0,0 +1,2 @@ +When we initialize the Accelerator +we can specify if we want to use mixed precision training and it will take care of it for us in the [prepare] call. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_142.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_142.txt new file mode 100644 index 0000000000000000000000000000000000000000..d8dfc6f22f3dba0949fc5d2acae860e6760d52c3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_142.txt @@ -0,0 +1,2 @@ +During the prepare +call the dataloader will also be distributed across workers should we use multiple GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_143.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_143.txt new file mode 100644 index 0000000000000000000000000000000000000000..082aa634c63ee3d78ccd6f95576f65873f9946d7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_143.txt @@ -0,0 +1 @@ +We use the same 8-bit optimizer from the earlier example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_144.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_144.txt new file mode 100644 index 0000000000000000000000000000000000000000..ca829159eb852ac8dd486db27a90383a3bc49437 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_144.txt @@ -0,0 +1 @@ +Finally, we can add the main training loop. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_145.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_145.txt new file mode 100644 index 0000000000000000000000000000000000000000..af269a8c767e81cf774d1c9b2182a85ffbdcc2a9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_145.txt @@ -0,0 +1 @@ +Note that the backward call is handled by 🤗 Accelerate. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_146.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_146.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd357f2f35af88ce1b2c089d553a6d881a780b40 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_146.txt @@ -0,0 +1,3 @@ +We can also see +how gradient accumulation works: we normalize the loss, so we get the average at the end of accumulation and once we have +enough steps we run the optimization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_147.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_147.txt new file mode 100644 index 0000000000000000000000000000000000000000..632b4670f03a551515cd44aa93f6c375af24eb70 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_147.txt @@ -0,0 +1,2 @@ +Implementing these optimization techniques with 🤗 Accelerate only takes a handful of lines of code and comes with the +benefit of more flexibility in the training loop. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_148.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_148.txt new file mode 100644 index 0000000000000000000000000000000000000000..018e15144ddf3e4e24ddac8e1a6f91e18421e091 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_148.txt @@ -0,0 +1,2 @@ +For a full documentation of all features have a look at the +Accelerate documentation. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_149.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_149.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa137559f6b396a5ee302e7950ed7d950132f79c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_149.txt @@ -0,0 +1,3 @@ +Efficient Software Prebuilds +PyTorch's pip and conda builds come prebuilt with the cuda toolkit +which is enough to run PyTorch, but it is insufficient if you need to build cuda extensions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_15.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..7fb41d893f7bfe5279dcaa1606d5253189e1f5c8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_15.txt @@ -0,0 +1,2 @@ +Batch size choice +To achieve optimal performance, start by identifying the appropriate batch size. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_150.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_150.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3c8abfb291bddf168cae86ab20211877d49c9a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_150.txt @@ -0,0 +1 @@ +At times, additional efforts may be required to pre-build some components. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_151.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_151.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbaae54ad515d7652d869e6df41b7c7f2f72d451 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_151.txt @@ -0,0 +1,2 @@ +For instance, if you're using libraries like apex that +don't come pre-compiled. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_152.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_152.txt new file mode 100644 index 0000000000000000000000000000000000000000..129dafa84d824e42d991c877dc01e86010291dc4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_152.txt @@ -0,0 +1 @@ +In other situations figuring out how to install the right cuda toolkit system-wide can be complicated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_153.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_153.txt new file mode 100644 index 0000000000000000000000000000000000000000..a168266e820aa4494abc6b2187511ee752130474 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_153.txt @@ -0,0 +1,2 @@ +To address these scenarios PyTorch and NVIDIA released a new version of NGC docker container which already comes with +everything prebuilt. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_154.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_154.txt new file mode 100644 index 0000000000000000000000000000000000000000..35c60f8e493bef4650cde888bd8e2d117543103c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_154.txt @@ -0,0 +1 @@ +You just need to install your programs on it, and it will run out of the box. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_155.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_155.txt new file mode 100644 index 0000000000000000000000000000000000000000..2a87c038f3e97f78fe616639fd73b9d76f6cd093 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_155.txt @@ -0,0 +1 @@ +This approach is also useful if you want to tweak the pytorch source and/or make a new customized build. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_156.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_156.txt new file mode 100644 index 0000000000000000000000000000000000000000..05dcee7485912f6095d6c5edc5eb5de69313ee1e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_156.txt @@ -0,0 +1,2 @@ +To find the docker image version you want start with PyTorch release notes, +choose one of the latest monthly releases. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_157.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_157.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8841fe29e8d97125cf37f47c6edf00c0155351e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_157.txt @@ -0,0 +1,2 @@ +Go into the release's notes for the desired release, check that the environment's +components are matching your needs (including NVIDIA Driver requirements!) \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_158.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_158.txt new file mode 100644 index 0000000000000000000000000000000000000000..43972ed72ed7ff75682dd6ac5c5822ec9beddf5e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_158.txt @@ -0,0 +1,2 @@ +and then at the very top of that document go +to the corresponding NGC page. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_159.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_159.txt new file mode 100644 index 0000000000000000000000000000000000000000..86cd41744b35908301ee40e842b8255747967747 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_159.txt @@ -0,0 +1 @@ +If for some reason you get lost, here is the index of all PyTorch NGC images. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_16.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..27ca23fd0d9e5497045b9508d4abba4c5facbe2a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_16.txt @@ -0,0 +1,2 @@ +It is recommended to use batch sizes and +input/output neuron counts that are of size 2^N. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_160.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_160.txt new file mode 100644 index 0000000000000000000000000000000000000000..00c25196874c1b6078fb51bd73b1d7b221e7abc5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_160.txt @@ -0,0 +1 @@ +Next follow the instructions to download and deploy the docker image. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_161.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_161.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab87e2e0c00dc38c22d789ce458a185f1f0b6148 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_161.txt @@ -0,0 +1,3 @@ +Mixture of Experts
Some recent papers reported a 4-5x training speedup and a faster inference by integrating
Mixture of Experts (MoE) into the Transformer models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_162.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_162.txt new file mode 100644 index 0000000000000000000000000000000000000000..473b259e4c3cf598ca3c6d26900042b094061d79 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_162.txt @@ -0,0 +1,2 @@ +Since it has been discovered that more parameters lead to better performance, this technique allows increasing the
number of parameters by an order of magnitude without increasing training costs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_163.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_163.txt new file mode 100644 index 0000000000000000000000000000000000000000..86b93f9a1f71b4d4618f6c9bfe7befb02f6d0d2d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_163.txt @@ -0,0 +1,2 @@ +In this approach every other FFN layer is replaced with a MoE Layer which consists of many experts, with a gated function
that trains each expert in a balanced way depending on the input token's position in a sequence. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_164.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_164.txt new file mode 100644 index 0000000000000000000000000000000000000000..85c4248bb4fd1109969bd973ddf4b25ebbb423a7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_164.txt @@ -0,0 +1,2 @@ +(source: GLAM)
You can find exhaustive details and comparison tables in the papers listed at the end of this section. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_165.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_165.txt new file mode 100644 index 0000000000000000000000000000000000000000..e52fbd3a589c18f6cbd8c2b8a86e476cc67e0016 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_165.txt @@ -0,0 +1,2 @@ +The main drawback of this approach is that it requires staggering amounts of GPU memory - almost an order of magnitude
larger than its dense equivalent. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_166.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_166.txt new file mode 100644 index 0000000000000000000000000000000000000000..e34a53b667bc61d6546c6aa86c0e755908f2c72a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_166.txt @@ -0,0 +1 @@ +Various distillation and other approaches have been proposed to overcome the much higher memory requirements.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_167.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_167.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cf863fd4c14e9a04522063a8bdb5750b307380c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_167.txt @@ -0,0 +1,3 @@ +There is direct trade-off though, you can use just a few experts with a 2-3x smaller base model instead of dozens or +hundreds experts leading to a 5x smaller model and thus increase the training speed moderately while increasing the +memory requirements moderately as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_168.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_168.txt new file mode 100644 index 0000000000000000000000000000000000000000..38c41db350d782bde7c20629ec30ed3cf39e2b7e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_168.txt @@ -0,0 +1,7 @@ +Most related papers and implementations are built around Tensorflow/TPUs: + +GShard: Scaling Giant Models with Conditional Computation and Automatic Sharding +Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity +GLaM: Generalist Language Model (GLaM) + +And for Pytorch DeepSpeed has built one as well: DeepSpeed-MoE: Advancing Mixture-of-Experts Inference and Training to Power Next-Generation AI Scale, Mixture of Experts - blog posts: 1, 2 and specific deployment with large transformer-based natural language generation models: blog post, Megatron-Deepspeed branch. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_169.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_169.txt new file mode 100644 index 0000000000000000000000000000000000000000..d2ccf3e841c636e334a29778ff2e5a8829fe8ae4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_169.txt @@ -0,0 +1,3 @@ +Using PyTorch native attention and Flash Attention +PyTorch 2.0 released a native torch.nn.functional.scaled_dot_product_attention (SDPA), +that allows using fused GPU kernels such as memory-efficient attention and flash attention. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_17.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..7b5a275028ce21fddba082e0d3299e7d9254cfcf --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_17.txt @@ -0,0 +1,2 @@ +Often it's a multiple of 8, but it can be +higher depending on the hardware being used and the model's dtype. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_170.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_170.txt new file mode 100644 index 0000000000000000000000000000000000000000..6879cec16275075331ebbcf87ad914d6b13c767e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_170.txt @@ -0,0 +1,5 @@ +After installing the optimum package, the relevant internal modules can be +replaced to use PyTorch's native attention with: +python +model = model.to_bettertransformer() +Once converted, train the model as usual. 
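A minimal sketch of calling the SDPA operator directly; the tensor shapes are arbitrary toy values in the (batch, heads, sequence length, head dimension) layout the operator expects:
python
import torch
import torch.nn.functional as F

batch, heads, seq_len, head_dim = 2, 8, 128, 64
q = torch.randn(batch, heads, seq_len, head_dim)
k = torch.randn(batch, heads, seq_len, head_dim)
v = torch.randn(batch, heads, seq_len, head_dim)

# With no attn_mask (and is_causal=True for causal-LM-style attention), SDPA is
# free to dispatch to a fused kernel such as Flash Attention when one is
# available for the current device and dtype; otherwise it falls back to the
# standard math implementation.
out = F.scaled_dot_product_attention(q, k, v, attn_mask=None, is_causal=True)
print(out.shape)   # torch.Size([2, 8, 128, 64])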
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_171.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_171.txt new file mode 100644 index 0000000000000000000000000000000000000000..3e92ffe610162b14e8b74f86fa637baef1c31bac --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_171.txt @@ -0,0 +1 @@ +The PyTorch-native scaled_dot_product_attention operator can only dispatch to Flash Attention if no attention_mask is provided. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_172.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_172.txt new file mode 100644 index 0000000000000000000000000000000000000000..5225e961c28cd70c8361d1c40b6bf273a636a49d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_172.txt @@ -0,0 +1 @@ +By default, in training mode, the BetterTransformer integration drops the mask support and can only be used for training that does not require a padding mask for batched training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_173.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_173.txt new file mode 100644 index 0000000000000000000000000000000000000000..28d0e2bc324eaafdd47558544e54f7881ffd543a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_173.txt @@ -0,0 +1 @@ +This is the case, for example, during masked language modeling or causal language modeling. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_174.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_174.txt new file mode 100644 index 0000000000000000000000000000000000000000..3fe22cd3590e2c637a4894d6bc6a41f61e85c563 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_174.txt @@ -0,0 +1 @@ +BetterTransformer is not suited for fine-tuning models on tasks that require a padding mask. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_175.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_175.txt new file mode 100644 index 0000000000000000000000000000000000000000..82440bbed8a91eecaede9cd7129bd53f7a396ba2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_175.txt @@ -0,0 +1 @@ +Check out this blogpost to learn more about acceleration and memory-savings with SDPA. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_18.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..7101597b36beef8f4ae9df21e1f57def7dcda6c6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_18.txt @@ -0,0 +1,3 @@ +For reference, check out NVIDIA's recommendation for input/output neuron counts and +batch size for +fully connected layers (which are involved in GEMMs (General Matrix Multiplications)). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_19.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..9c6d6118d3241c99146fe6a4b47a217aa1783595 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_19.txt @@ -0,0 +1,2 @@ +Tensor Core Requirements +define the multiplier based on the dtype and the hardware. 
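One practical way to follow these dimension recommendations is to pad tokenized batches so the sequence length lands on a multiple of 8; below is a minimal sketch using the tokenizer's pad_to_multiple_of argument (the checkpoint and the sentences are placeholder assumptions):
python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")  # placeholder checkpoint
batch = tokenizer(
    ["A short sentence.", "A slightly longer example sentence used for padding."],
    padding=True,
    pad_to_multiple_of=8,   # keeps the sequence dimension tensor-core friendly for fp16
    return_tensors="pt",
)
print(batch["input_ids"].shape)  # the sequence length is rounded up to a multiple of 8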
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_2.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..348ff2a463eec3bbee7bc4493803a3f5ce6c1b15 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_2.txt @@ -0,0 +1,2 @@ +This guide +focuses on practical techniques. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_20.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5b900eaef3ea44d1570c88591860c5f0c67a77f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_20.txt @@ -0,0 +1,2 @@ +For instance, for fp16 data type a multiple of 8 is recommended, unless +it's an A100 GPU, in which case use multiples of 64. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_21.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..f796ad58de6095c01b887f5bb07d6364bff75753 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_21.txt @@ -0,0 +1 @@ +For parameters that are small, consider also Dimension Quantization Effects. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_22.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a38c28d2af9c65961feac06829e06bc5da1818d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_22.txt @@ -0,0 +1 @@ +This is where tiling happens and the right multiplier can have a significant speedup. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_23.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..d3330a48e8d682a750e4b37b34bd18bf8646d6e3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_23.txt @@ -0,0 +1,3 @@ +Gradient Accumulation +The gradient accumulation method aims to calculate gradients in smaller increments instead of computing them for the +entire batch at once. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_24.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..294a48aac68881857f4719343a75e3a3866385dd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_24.txt @@ -0,0 +1,2 @@ +This approach involves iteratively calculating gradients in smaller batches by performing forward +and backward passes through the model and accumulating the gradients during the process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_25.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..cebd1c64689982b12b08c510c0ff192bd2229e0e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_25.txt @@ -0,0 +1,2 @@ +Once a sufficient number of +gradients have been accumulated, the model's optimization step is executed. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_26.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0b253ecba3f402fa3a947fd972d5fc7a07105a1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_26.txt @@ -0,0 +1,2 @@ +By employing gradient accumulation, it +becomes possible to increase the effective batch size beyond the limitations imposed by the GPU's memory capacity. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_27.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..5cfc7b6f2ee4413856b1c4ecb0bfabcd7070ce9b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_27.txt @@ -0,0 +1,2 @@ +However, it is important to note that the additional forward and backward passes introduced by gradient accumulation can +slow down the training process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_28.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c447ba576fbd41fd5badef6415e13cfc665fa8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_28.txt @@ -0,0 +1,4 @@ +You can enable gradient accumulation by adding the gradient_accumulation_steps argument to [TrainingArguments]: +py +training_args = TrainingArguments(per_device_train_batch_size=1, gradient_accumulation_steps=4, **default_args) +In the above example, your effective batch size becomes 4. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_29.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5b43672db00226ffd370d0e2a395633cf7d64fe --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_29.txt @@ -0,0 +1 @@ +Alternatively, use 🤗 Accelerate to gain full control over the training loop. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_3.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..7ba291b0aacd490f6f66ecae6d51a04aaf94e25f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_3.txt @@ -0,0 +1 @@ +If you have access to a machine with multiple GPUs, these approaches are still valid, plus you can leverage additional methods outlined in the multi-GPU section. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_30.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..8304e25e0a56f92a03ef8a44a4ea8911400acc91 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_30.txt @@ -0,0 +1,2 @@ +Find the 🤗 Accelerate example +further down in this guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_31.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..13b3f35ac80c8d2dc35e5683ff788328d57ddfc9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_31.txt @@ -0,0 +1,2 @@ +While it is advised to max out GPU usage as much as possible, a high number of gradient accumulation steps can +result in a more pronounced training slowdown. 
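For readers who want to see the mechanics behind the gradient_accumulation_steps flag, this is a minimal, self-contained sketch of gradient accumulation in a plain PyTorch loop; the tiny linear model and random data are stand-ins for a real model and dataloader:
python
import torch
from torch import nn

model = nn.Linear(10, 2)                              # toy stand-in for a real model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
data = [(torch.randn(4, 10), torch.randint(0, 2, (4,))) for _ in range(8)]

accumulation_steps = 4                                # 4 micro-batches of 4 -> effective batch size 16
optimizer.zero_grad()
for step, (x, y) in enumerate(data):
    loss = loss_fn(model(x), y) / accumulation_steps  # scale so the summed gradient matches one large batch
    loss.backward()                                   # gradients accumulate in .grad between optimizer steps
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()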
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_32.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..b645bc3d2df1dd8e5faea24bfbb868c708fecbfc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_32.txt @@ -0,0 +1 @@ +Consider the following example. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_33.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..4f0f9c52cd3e1df03f1e3f0090d8aeab830fcd5c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_33.txt @@ -0,0 +1,2 @@ +Let's say, the per_device_train_batch_size=4 +without gradient accumulation hits the GPU's limit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_34.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..3cce299fd530add59e3f714d7d5b4cb6f963c156 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_34.txt @@ -0,0 +1,2 @@ +If you would like to train with batches of size 64, do not set the +per_device_train_batch_size to 1 and gradient_accumulation_steps to 64. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_35.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed90fd6c936f754ac736e92b49ac89a2d6001563 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_35.txt @@ -0,0 +1,2 @@ +Instead, keep per_device_train_batch_size=4 +and set gradient_accumulation_steps=16. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_36.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..99546ad93fedd55956913b0db55ff2a44871c69e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_36.txt @@ -0,0 +1,2 @@ +This results in the same effective batch size while making better use of +the available GPU resources. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_37.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..36415cefcf4e37fc8273544133d39fed50809b5e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_37.txt @@ -0,0 +1,2 @@ +For additional information, please refer to batch size and gradient accumulation benchmarks for RTX-3090 +and A100. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_38.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..45c6dfb9d29bfc98d12825a27707dc52ec4319fc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_38.txt @@ -0,0 +1,2 @@ +Gradient Checkpointing +Some large models may still face memory issues even when the batch size is set to 1 and gradient accumulation is used. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_39.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..bd76b751e3d530ceddb8e2ce7b362331b78b1cf0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_39.txt @@ -0,0 +1 @@ +This is because there are other components that also require memory storage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_4.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..55d4d7921499c3c9da62118011640aa703f04d5a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_4.txt @@ -0,0 +1,6 @@ +When training large models, there are two aspects that should be considered at the same time: + +Data throughput/training time +Model performance + +Maximizing the throughput (samples/second) leads to lower training cost. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_40.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a055632f7f0a4d3554cd205854e3d465bb1aaed --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_40.txt @@ -0,0 +1,2 @@ +Saving all activations from the forward pass in order to compute the gradients during the backward pass can result in +significant memory overhead. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_41.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1e7a1db474606e8b634c366adb9cdd757ffc2f9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_41.txt @@ -0,0 +1,2 @@ +The alternative approach of discarding the activations and recalculating them when needed +during the backward pass, would introduce a considerable computational overhead and slow down the training process. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_42.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c28136eac8a7b54e66ffd48e245a2624b584da4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_42.txt @@ -0,0 +1,2 @@ +Gradient checkpointing offers a compromise between these two approaches and saves strategically selected activations +throughout the computational graph so only a fraction of the activations need to be re-computed for the gradients. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_43.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..09f32802a7be02879ea885e26cd7af96284f6af2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_43.txt @@ -0,0 +1,2 @@ +For +an in-depth explanation of gradient checkpointing, refer to this great article. 
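Besides the [Trainer] flag shown next, gradient checkpointing can also be switched on directly on a 🤗 Transformers model object; a minimal sketch with a placeholder checkpoint:
python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")  # placeholder checkpoint
model.gradient_checkpointing_enable()  # recompute selected activations during the backward pass
# training now trades roughly 20% extra compute for a much smaller activation memory footprint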
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_44.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..545c9a81cf709d8155bec09fbd5e4b575c50db31 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_44.txt @@ -0,0 +1,6 @@ +To enable gradient checkpointing in the [Trainer], pass the corresponding flag to [TrainingArguments]: +py +training_args = TrainingArguments( + per_device_train_batch_size=1, gradient_accumulation_steps=4, gradient_checkpointing=True, **default_args +) +Alternatively, use 🤗 Accelerate - find the 🤗 Accelerate example further in this guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_45.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..a78c2350bdcb205c81a55e83da9f656c64d4823d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_45.txt @@ -0,0 +1 @@ +While gradient checkpointing may improve memory efficiency, it slows training by approximately 20%. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_46.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7869a9a871444554ae2d65e1c7feb8532eb85a7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_46.txt @@ -0,0 +1,3 @@ +Mixed precision training +Mixed precision training is a technique that aims to optimize the computational efficiency of training models by +utilizing lower-precision numerical formats for certain variables. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_47.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..32db3745aa02e77f0615b0eb606becdda82d6259 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_47.txt @@ -0,0 +1,2 @@ +Traditionally, most models use 32-bit floating point +precision (fp32 or float32) to represent and process variables. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_48.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..e79a50deaea61fa9d96ba671bf15b2818ea0c411 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_48.txt @@ -0,0 +1,2 @@ +However, not all variables require this high precision +level to achieve accurate results. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_49.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2b596402c8ac38ba30b173e154796bcce11556 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_49.txt @@ -0,0 +1,2 @@ +By reducing the precision of certain variables to lower numerical formats like 16-bit +floating point (fp16 or float16), we can speed up the computations.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_5.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..528dad8df8fa9fe174995e284ebbff68fa5b4a36 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_5.txt @@ -0,0 +1,2 @@ +This is generally achieved by utilizing the GPU +as much as possible and thus filling GPU memory to its limit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_50.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..1075aff2d176760c32bddf6e20eec6fba58ef7ba --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_50.txt @@ -0,0 +1,2 @@ +Because in this approach some computations are performed +in half-precision, while some are still in full precision, the approach is called mixed precision training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_51.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..05fb6cc5ff7db9928e3bd597aa65d56f38f5883a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_51.txt @@ -0,0 +1,2 @@ +Most commonly mixed precision training is achieved by using fp16 (float16) data types, however, some GPU architectures +(such as the Ampere architecture) offer bf16 and tf32 (CUDA internal data type) data types. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_52.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b622360fbb21007d1473ab4e53488ab33ee91f8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_52.txt @@ -0,0 +1,3 @@ +Check +out the NVIDIA Blog to learn more about +the differences between these data types. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_53.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..7493325e9634d4c29afbfcaf54079e57c8a96c8a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_53.txt @@ -0,0 +1,2 @@ +fp16 +The main advantage of mixed precision training comes from saving the activations in half precision (fp16). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_54.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..e7d6385964e379112b3298fdf4881ecac6570f71 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_54.txt @@ -0,0 +1,2 @@ +Although the gradients are also computed in half precision they are converted back to full precision for the optimization +step so no memory is saved here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_55.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..d49c260263203843ce69ade21c1c8efe489a84c9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_55.txt @@ -0,0 +1 @@ +While mixed precision training results in faster computations, it can also lead to more GPU memory being utilized, especially for small batch sizes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_56.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..0585d042eee4af1493a1751777dda2eca56dcd6e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_56.txt @@ -0,0 +1 @@ +This is because the model is now present on the GPU in both 16-bit and 32-bit precision (1.5x the original model on the GPU). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_57.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..5b4f9d0f566827a2f07a3129422fb0f285a7c2b1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_57.txt @@ -0,0 +1,4 @@ +To enable mixed precision training, set the fp16 flag to True: +py +training_args = TrainingArguments(per_device_train_batch_size=4, fp16=True, **default_args) +If you prefer to use 🤗 Accelerate, find the 🤗 Accelerate example further in this guide. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_58.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..4856dc3f031bcc47c72e1f518c187d2a8c09c461 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_58.txt @@ -0,0 +1,2 @@ +BF16 +If you have access to an Ampere or newer hardware you can use bf16 for mixed precision training and evaluation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_59.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..c371b73a7de4fb45f21ceb6d0a1055d3697202c3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_59.txt @@ -0,0 +1,2 @@ +While +bf16 has a worse precision than fp16, it has a much bigger dynamic range. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_6.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..aff51e01a16fe1741084b4fb2ba2f12de13ff0db --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_6.txt @@ -0,0 +1,2 @@ +If the desired batch size exceeds the limits of the GPU memory, +the memory optimization techniques, such as gradient accumulation, can help. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_60.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..97073c879b0ad02ba0c69df15bdc74880ae4c4b4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_60.txt @@ -0,0 +1,2 @@ +In fp16 the biggest number you can have +is 65535 and any number above that will result in an overflow. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_61.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..71a32f8450e009e42d34b515ea5d89ee95855343 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_61.txt @@ -0,0 +1 @@ +A bf16 number can be as large as 3.39e+38 (!) 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_62.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..e89015e61dfe5e55f388ec96cb1f44f932231475 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_62.txt @@ -0,0 +1,2 @@ +which +is about the same as fp32 - because both have 8-bits used for the numerical range. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_63.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..7f252f1ee500b166ab11c519fdd7bb8e7301fbb4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_63.txt @@ -0,0 +1,5 @@ +You can enable BF16 in the 🤗 Trainer with: +python +training_args = TrainingArguments(bf16=True, **default_args) +TF32 +The Ampere hardware uses a magical data type called tf32. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_64.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..58b5c49c4288eec145e72092fc12fef1682d25dd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_64.txt @@ -0,0 +1,2 @@ +It has the same numerical range as fp32 (8-bits), but instead +of 23 bits precision it has only 10 bits (same as fp16) and uses only 19 bits in total. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_65.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..5d5db5233b9c7ac08898a4b4e2f671fff87bc13f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_65.txt @@ -0,0 +1,3 @@ +It's "magical" in the sense that +you can use the normal fp32 training and/or inference code and by enabling tf32 support you can get up to 3x throughput +improvement. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_66.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..31f36788bb7fb09aae696da4776618f5d301c8db --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_66.txt @@ -0,0 +1,6 @@ +All you need to do is to add the following to your code: +python +import torch +torch.backends.cuda.matmul.allow_tf32 = True +torch.backends.cudnn.allow_tf32 = True +CUDA will automatically switch to using tf32 instead of fp32 where possible, assuming that the used GPU is from the Ampere series. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_67.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a28ff765451ac40df86030ede73f6b2e47bf106 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_67.txt @@ -0,0 +1,2 @@ +According to NVIDIA research, the +majority of machine learning training workloads show the same perplexity and convergence with tf32 training as with fp32. 
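Since both bf16 and tf32 depend on Ampere-or-newer GPUs, it can be worth checking the hardware before flipping these flags; a small sketch:
python
import torch

if torch.cuda.is_available():
    major, minor = torch.cuda.get_device_capability()
    print("compute capability:", f"{major}.{minor}")
    print("bf16 supported:", torch.cuda.is_bf16_supported())
    print("Ampere or newer (tf32-capable):", major >= 8)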
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_68.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..e41b32c42f45f5132d2b9be0b601ab24f5cf6f34 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_68.txt @@ -0,0 +1 @@ +If you're already using fp16 or bf16 mixed precision it may help with the throughput as well. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_69.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..b622ddaceafde63c155a998cf67a09fc6e897f43 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_69.txt @@ -0,0 +1,5 @@ +You can enable this mode in the 🤗 Trainer: +python +TrainingArguments(tf32=True, **default_args) + +tf32 can't be accessed directly via tensor.to(dtype=torch.tf32) because it is an internal CUDA data type. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_7.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..80de6442b2ce37587650d371df313dd6feba55fb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_7.txt @@ -0,0 +1,2 @@ +However, if the preferred batch size fits into memory, there's no reason to apply memory-optimizing techniques because they can +slow down the training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_70.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..a0ceabe9bd055e5cf33ce692514100c742c565e2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_70.txt @@ -0,0 +1 @@ +You need torch>=1.7 to use tf32 data types. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_71.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..2acc4b257073c4d1b429ad547eaa7158ece6d24d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_71.txt @@ -0,0 +1,3 @@ +For additional information on tf32 vs other precisions, please refer to the following benchmarks: +RTX-3090 and +A100. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_72.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..f324d9e7b2f4558392d16f7e8a2a0678320991cc --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_72.txt @@ -0,0 +1,2 @@ +Flash Attention 2 +You can speedup the training throughput by using Flash Attention 2 integration in transformers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_73.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..46e539336c5f8ddb503b8ea37ce1355e4aeffef0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_73.txt @@ -0,0 +1 @@ +Check out the appropriate section in the single GPU section to learn more about how to load a model with Flash Attention 2 modules. 
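As a pointer, opting into Flash Attention 2 is typically done at load time in recent transformers releases; this is a sketch that assumes the flash-attn package is installed, an fp16/bf16 dtype is used, and a placeholder checkpoint:
python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-7b-hf",                # placeholder checkpoint
    torch_dtype=torch.bfloat16,                # Flash Attention 2 requires fp16 or bf16 weights
    attn_implementation="flash_attention_2",
)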
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_74.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..4fddb870ab02f6a093c56520343a2bfe2a2bd93e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_74.txt @@ -0,0 +1,2 @@ +Optimizer choice +The most common optimizer used to train transformer models is Adam or AdamW (Adam with weight decay). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_75.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..e728cf1baaec3a84ab4be1faa4dfd3d586095ebb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_75.txt @@ -0,0 +1,3 @@ +Adam achieves +good convergence by storing the rolling average of the previous gradients; however, it adds an additional memory +footprint of the order of the number of model parameters. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_76.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..a6b33cae72829a11cedd9687e7445f57562b8473 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_76.txt @@ -0,0 +1 @@ +To remedy this, you can use an alternative optimizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_77.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..efe425b9f8b8f07037635821f1f6314eb97ae8e4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_77.txt @@ -0,0 +1,2 @@ +For example if you have NVIDIA/apex installed for NVIDIA GPUs, or ROCmSoftwarePlatform/apex for AMD GPUs, adamw_apex_fused will give you the +fastest training experience among all supported AdamW optimizers. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_78.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b125e53ffa7fe60f2f37c8b18b6ea7b0361adb3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_78.txt @@ -0,0 +1,2 @@ +[Trainer] integrates a variety of optimizers that can be used out of box: adamw_hf, adamw_torch, adamw_torch_fused, +adamw_apex_fused, adamw_anyprecision, adafactor, or adamw_bnb_8bit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_79.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..c46311b241db9e76f7ea9d867f51fdedf385b307 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_79.txt @@ -0,0 +1 @@ +More optimizers can be plugged in via a third-party implementation. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_8.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..4a0f1ca8bd42744eb63e274ee7a0de504b557029 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_8.txt @@ -0,0 +1 @@ +Just because one can use a large batch size, does not necessarily mean they should. 
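Selecting any of the optimizers listed above is a one-line change on [TrainingArguments]; for example, a sketch picking the fused torch AdamW (default_args is the placeholder dictionary used throughout this guide):
python
training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_torch_fused", **default_args)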
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_80.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..86350d7ef6eb4f4f3fe5b4058bd1edb2c49ac34e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_80.txt @@ -0,0 +1,3 @@ +Let's take a closer look at two alternatives to AdamW optimizer: +1. adafactor which is available in [Trainer] +2. adamw_bnb_8bit is also available in Trainer, but a third-party integration is provided below for demonstration. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_81.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..30cb6aa2b38f50c58f532fe89296309b194891e0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_81.txt @@ -0,0 +1,3 @@ +For comparison, for a 3B-parameter model, like "google-t5/t5-3b": +* A standard AdamW optimizer will need 24GB of GPU memory because it uses 8 bytes for each parameter (8*3 => 24GB) +* Adafactor optimizer will need more than 12GB. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_82.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..6c3c29add3ef1640660d3e0e194f9ce4de87f6f3 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_82.txt @@ -0,0 +1 @@ +It uses slightly more than 4 bytes for each parameter, so 4*3 and then some extra. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_83.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b18fc2ddfbf644d31fa54eec4541da1dbeb64d4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_83.txt @@ -0,0 +1 @@ +* 8bit BNB quantized optimizer will use only (2*3) 6GB if all optimizer states are quantized. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_84.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..af1b2d4b6c47cd8f0763e2e301363a0bd7e939be --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_84.txt @@ -0,0 +1,2 @@ +Adafactor +Adafactor doesn't store rolling averages for each element in weight matrices. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_85.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..facb1299e4d3fcb1fbdff2c6baed7a5258cd655a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_85.txt @@ -0,0 +1,2 @@ +Instead, it keeps aggregated information +(sums of rolling averages row- and column-wise), significantly reducing its footprint. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_86.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..53563ce7c6af7418bad7fb57b7c8a554265342f6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_86.txt @@ -0,0 +1,2 @@ +However, compared to Adam, +Adafactor may have slower convergence in certain cases.
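The arithmetic behind these memory numbers can be written out explicitly; a small sketch using 3B parameters and decimal GB:
python
params = 3e9  # ~3B parameters, roughly the size of google-t5/t5-3b

adamw_bytes     = 8 * params  # two fp32 moments (4 bytes each) per parameter
adafactor_bytes = 4 * params  # roughly 4 bytes per parameter, plus "some extra" for the factored stats
adam8bit_bytes  = 2 * params  # two quantized 1-byte moments per parameter

for name, b in [("AdamW", adamw_bytes), ("Adafactor", adafactor_bytes), ("8-bit Adam", adam8bit_bytes)]:
    print(f"{name}: ~{b / 1e9:.0f}GB of optimizer state")  # 24GB / 12GB / 6GB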
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_87.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..ff5eba43d42309ffba7518ec5d5c6914bb691e26 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_87.txt @@ -0,0 +1,5 @@ +You can switch to Adafactor by setting optim="adafactor" in [TrainingArguments]: +py +training_args = TrainingArguments(per_device_train_batch_size=4, optim="adafactor", **default_args) +Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training) +you can notice up to 3x improvement while maintaining the throughput! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_88.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..47d5772aae11b12ae1a2a581df83162a19350c3b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_88.txt @@ -0,0 +1,2 @@ +However, as mentioned before, the convergence of +Adafactor can be worse than Adam. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_89.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..93c84603807f1f2022190841c0ed05558a5045e5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_89.txt @@ -0,0 +1,2 @@ +8-bit Adam +Instead of aggregating optimizer states like Adafactor, 8-bit Adam keeps the full state and quantizes it. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_9.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..fd15388932dc2cb787e74134e2f89d9a1a46d6fe --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_9.txt @@ -0,0 +1,2 @@ +As part of +hyperparameter tuning, you should determine which batch size yields the best results and then optimize resources accordingly. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_90.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..879fd57d48e53d73b7406a3d2300afe3916582ee --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_90.txt @@ -0,0 +1,2 @@ +Quantization +means that it stores the state with lower precision and dequantizes it only for the optimization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_91.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..53c6d35516bbd6c9057ade2bd5779fd084676856 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_91.txt @@ -0,0 +1,2 @@ +This is similar to the +idea behind mixed precision training. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_92.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_92.txt new file mode 100644 index 0000000000000000000000000000000000000000..638d3f93540f0b80bac65086fd90f2d58ef0ba3c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_92.txt @@ -0,0 +1,4 @@ +To use adamw_bnb_8bit, you simply need to set optim="adamw_bnb_8bit" in [TrainingArguments]: +py +training_args = TrainingArguments(per_device_train_batch_size=4, optim="adamw_bnb_8bit", **default_args) +However, we can also use a third-party implementation of the 8-bit optimizer for demonstration purposes to see how that can be integrated. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_93.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_93.txt new file mode 100644 index 0000000000000000000000000000000000000000..5129d713906bf3dd7bc6d38937edcb3d970496a6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_93.txt @@ -0,0 +1,2 @@ +First, follow the installation guide in the GitHub repo to install the bitsandbytes library +that implements the 8-bit Adam optimizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_94.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_94.txt new file mode 100644 index 0000000000000000000000000000000000000000..db963c4dc99322872e72c11db7904017917f5763 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_94.txt @@ -0,0 +1 @@ +Next you need to initialize the optimizer. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_95.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_95.txt new file mode 100644 index 0000000000000000000000000000000000000000..ab5969d708114eb1ff866c8235fe737a171981df --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_95.txt @@ -0,0 +1,2 @@ +This involves two steps: +* First, group the model's parameters into two groups - one where weight decay should be applied, and the other one where it should not. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_96.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_96.txt new file mode 100644 index 0000000000000000000000000000000000000000..202d84130db72be1e28c3f3332dd1d18f3610358 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_96.txt @@ -0,0 +1 @@ +Usually, biases and layer norm parameters are not weight decayed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_97.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_97.txt new file mode 100644 index 0000000000000000000000000000000000000000..2ebb8cbb84a989ac48b097241b7d12ff4e1800ae --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_97.txt @@ -0,0 +1 @@ +* Then do some argument housekeeping to use the same parameters as the previously used AdamW optimizer. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_98.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_98.txt new file mode 100644 index 0000000000000000000000000000000000000000..29ed74e7e3148d6e353659170fd3a7146d5e58b8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_98.txt @@ -0,0 +1,33 @@ +import bitsandbytes as bnb +from torch import nn +from transformers.trainer_pt_utils import get_parameter_names +training_args = TrainingArguments(per_device_train_batch_size=4, **default_args) +decay_parameters = get_parameter_names(model, [nn.LayerNorm]) +decay_parameters = [name for name in decay_parameters if "bias" not in name] +optimizer_grouped_parameters = [ + { + "params": [p for n, p in model.named_parameters() if n in decay_parameters], + "weight_decay": training_args.weight_decay, + }, + { + "params": [p for n, p in model.named_parameters() if n not in decay_parameters], + "weight_decay": 0.0, + }, +] +optimizer_kwargs = { + "betas": (training_args.adam_beta1, training_args.adam_beta2), + "eps": training_args.adam_epsilon, +} +optimizer_kwargs["lr"] = training_args.learning_rate +adam_bnb_optim = bnb.optim.Adam8bit( + optimizer_grouped_parameters, + betas=(training_args.adam_beta1, training_args.adam_beta2), + eps=training_args.adam_epsilon, + lr=training_args.learning_rate, +) + +Finally, pass the custom optimizer as an argument to the Trainer: +py +trainer = Trainer(model=model, args=training_args, train_dataset=ds, optimizers=(adam_bnb_optim, None)) +Combined with other approaches (gradient accumulation, gradient checkpointing, and mixed precision training), +you can expect to get about a 3x memory improvement and even slightly higher throughput than when using Adafactor. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_gpu_one/chunk_99.txt b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_99.txt new file mode 100644 index 0000000000000000000000000000000000000000..c50e220ca7cf6488b585ff4b4ee31dc1daa4af21 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_gpu_one/chunk_99.txt @@ -0,0 +1,3 @@ +multi_tensor +pytorch-nightly introduced torch.optim._multi_tensor which should significantly speed up the optimizers for situations +with lots of small feature tensors. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_0.txt b/chunked/nltk_chunking/_perf_train_special/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..d4a37ca7694fc9dfa9c3185ddd0e821e207cf186 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_0.txt @@ -0,0 +1,3 @@ + +PyTorch training on Apple silicon +Previously, training models on a Mac was limited to the CPU only. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_1.txt b/chunked/nltk_chunking/_perf_train_special/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..1e5e115a477d1684daa0cc4695916b521b386f09 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_1.txt @@ -0,0 +1 @@ +With the release of PyTorch v1.12, you can take advantage of training models with Apple's silicon GPUs for significantly faster performance and training.
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_10.txt b/chunked/nltk_chunking/_perf_train_special/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b519cb7f8b35b092a05bebdcb1a67412e199796 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_10.txt @@ -0,0 +1 @@ +For example, you can run the run_glue.py script with the MPS backend automatically enabled without making any changes. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_11.txt b/chunked/nltk_chunking/_perf_train_special/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..0b8bceee900f42929b1a9bad9477cbd219f266ed --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_11.txt @@ -0,0 +1,15 @@ +export TASK_NAME=mrpc +python examples/pytorch/text-classification/run_glue.py \ + --model_name_or_path google-bert/bert-base-cased \ + --task_name $TASK_NAME \ +- --use_mps_device \ + --do_train \ + --do_eval \ + --max_seq_length 128 \ + --per_device_train_batch_size 32 \ + --learning_rate 2e-5 \ + --num_train_epochs 3 \ + --output_dir /tmp/$TASK_NAME/ \ + --overwrite_output_dir + +Backends for distributed setups like gloo and nccl are not supported by the mps device which means you can only train on a single GPU with the MPS backend. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_12.txt b/chunked/nltk_chunking/_perf_train_special/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..82841c3e97057c015600efc08e241a81fa93700f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_12.txt @@ -0,0 +1 @@ +You can learn more about the MPS backend in the Introducing Accelerated PyTorch Training on Mac blog post. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_2.txt b/chunked/nltk_chunking/_perf_train_special/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..d1e6eef46496fb598e098cd4716f5b830df11c1f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_2.txt @@ -0,0 +1 @@ +This is powered in PyTorch by integrating Apple's Metal Performance Shaders (MPS) as a backend. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_3.txt b/chunked/nltk_chunking/_perf_train_special/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..486cf3217b0eca26c1ce2bea66ba61fc9c7321b4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_3.txt @@ -0,0 +1 @@ +The MPS backend implements PyTorch operations as custom Metal shaders and places these modules on a mps device. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_4.txt b/chunked/nltk_chunking/_perf_train_special/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9768a348288efc170e261e7801299035de1c3472 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_4.txt @@ -0,0 +1 @@ +Some PyTorch operations are not implemented in MPS yet and will throw an error. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_5.txt b/chunked/nltk_chunking/_perf_train_special/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..edd15974b002e15545e7872c0c86f38dfbcf4f30 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_5.txt @@ -0,0 +1 @@ +To avoid this, you should set the environment variable PYTORCH_ENABLE_MPS_FALLBACK=1 to use the CPU kernels instead (you'll still see a UserWarning). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_6.txt b/chunked/nltk_chunking/_perf_train_special/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea21084d9f4af84694ac19e7b6849a279fb5c5ba --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_6.txt @@ -0,0 +1 @@ +If you run into any other errors, please open an issue in the PyTorch repository because the [Trainer] only integrates the MPS backend. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_7.txt b/chunked/nltk_chunking/_perf_train_special/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..bdb05fb14b9073e73d7f8247d45d3470e649eb2d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_7.txt @@ -0,0 +1,7 @@ +With the mps device set, you can: + +train larger networks or batch sizes locally +reduce data retrieval latency because the GPU's unified memory architecture allows direct access to the full memory store +reduce costs because you don't need to train on cloud-based GPUs or add additional local GPUs + +Get started by making sure you have PyTorch installed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_8.txt b/chunked/nltk_chunking/_perf_train_special/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..067ba83c4c34b68cf569e5f224b2e3ac028dfd72 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_8.txt @@ -0,0 +1 @@ +MPS acceleration is supported on macOS 12.3+. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_special/chunk_9.txt b/chunked/nltk_chunking/_perf_train_special/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..dd15f29624342413432ae5664ddc91c1fdbf76e9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_special/chunk_9.txt @@ -0,0 +1,2 @@ +pip install torch torchvision torchaudio +[TrainingArguments] uses the mps device by default if it's available which means you don't need to explicitly set the device. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_0.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..cd21c8427b754f9a50ee47fcc7cc28a08f011881 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_0.txt @@ -0,0 +1,4 @@ + +Training on TPU with TensorFlow + +If you don't need long explanations and just want TPU code samples to get started with, check out our TPU example notebook! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_1.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..e5e5df4aa31f402d776df4e0be014b85327253fd --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_1.txt @@ -0,0 +1 @@ +What is a TPU? 
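Before the TPU discussion continues, here is a minimal sketch that ties the MPS notes above together: check that the backend is available and move a module and its inputs to the mps device (the tiny linear layer is a stand-in for a real model):
python
import torch

# PYTORCH_ENABLE_MPS_FALLBACK=1 is usually exported in the shell before launching the script
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = torch.nn.Linear(16, 4).to(device)     # a Transformers model is moved the same way
x = torch.randn(8, 16, device=device)
print(model(x).device)                        # mps:0 on Apple silicon, cpu otherwise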
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_10.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b77c2af888a1d7dc201eeb6fb064e2fc5a5fd93 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_10.txt @@ -0,0 +1 @@ +New users are often very confused by the range of TPUs, and the different ways to access them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_11.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..7be5ee21a74335acfb450cda1bd3b659812127f5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_11.txt @@ -0,0 +1 @@ +The first key distinction to understand is the difference between TPU Nodes and TPU VMs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_12.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..df310013d82ccc88ee20516be9a26861df8c568b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_12.txt @@ -0,0 +1 @@ +When you use a TPU Node, you are effectively indirectly accessing a remote TPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_13.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..a18f5a8bbc07b29771a2f77db366732f27084c5f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_13.txt @@ -0,0 +1 @@ +You will need a separate VM, which will initialize your network and data pipeline and then forward them to the remote node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_14.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..cbebb11791d3328ae79fe6786e8e3d12019aac40 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_14.txt @@ -0,0 +1 @@ +When you use a TPU on Google Colab, you are accessing it in the TPU Node style. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_15.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..520381111e0dbc20c7222866f26a032fa7f533d9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_15.txt @@ -0,0 +1 @@ +Using TPU Nodes can have some quite unexpected behaviour for people who aren't used to them! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_16.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..86cddec5f21f8fd89e0babf28862f5f5898b1d96 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_16.txt @@ -0,0 +1 @@ +In particular, because the TPU is located on a physically different system to the machine you're running your Python code on, your data cannot be local to your machine - any data pipeline that loads from your machine's internal storage will totally fail!
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_17.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..6598b3722558a690959a52e1d4a0ed6c2016eeb4 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_17.txt @@ -0,0 +1 @@ +Instead, data must be stored in Google Cloud Storage where your data pipeline can still access it, even when the pipeline is running on the remote TPU node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_18.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..e08350e901047720ee3ec2f83abecd64d0ebb4d7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_18.txt @@ -0,0 +1 @@ +If you can fit all your data in memory as np.ndarray or tf.Tensor, then you can fit() on that data even when using Colab or a TPU Node, without needing to upload it to Google Cloud Storage. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_19.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..479de7122884f7d6d3ad82be764ebb08c9316a1c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_19.txt @@ -0,0 +1 @@ +🤗Specific Hugging Face Tip🤗: The methods Dataset.to_tf_dataset() and its higher-level wrapper model.prepare_tf_dataset(), which you will see throughout our TF code examples, will both fail on a TPU Node. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_2.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..1ae9857967b47b7bd1afdcbf3ac01b3198865ec8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_2.txt @@ -0,0 +1 @@ +A TPU is a Tensor Processing Unit. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_20.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..fbf23eb26d90e218312213f911968e4e41e1bd42 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_20.txt @@ -0,0 +1 @@ +The reason for this is that even though they create a tf.data.Dataset it is not a “pure” tf.data pipeline and uses tf.numpy_function or Dataset.from_generator() to stream data from the underlying HuggingFace Dataset. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_21.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..c45cd06e80fb2850d7e8e2a9228a33f30079e7e8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_21.txt @@ -0,0 +1 @@ +This HuggingFace Dataset is backed by data that is on a local disc and which the remote TPU Node will not be able to read. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_22.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..5eec2b67781228ee6a0d75188b3ff1d533644e6d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_22.txt @@ -0,0 +1 @@ +The second way to access a TPU is via a TPU VM. 
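To make the in-memory escape hatch mentioned above (calling fit() on np.ndarray data instead of a streamed tf.data pipeline) concrete, here is a hedged sketch; the checkpoint name and column names are illustrative, and it assumes a small tokenized dataset that has already been padded to a fixed length and fits in RAM.
python
import numpy as np
from transformers import TFAutoModelForSequenceClassification

# `tokenized_dataset` is a hypothetical small, already-padded tokenized dataset that fits in memory.
features = {
    "input_ids": np.array(tokenized_dataset["input_ids"]),
    "attention_mask": np.array(tokenized_dataset["attention_mask"]),
}
labels = np.array(tokenized_dataset["label"])

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
model.compile(optimizer="adam")  # Transformers TF models fall back to their internal loss
# Because the arrays live in memory, nothing has to be streamed from local disk,
# so this also works in the TPU Node / Colab setting (TPU strategy setup omitted here).
model.fit(features, labels, batch_size=8, epochs=1)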
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_23.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_23.txt new file mode 100644 index 0000000000000000000000000000000000000000..ea814f5b66ebc9c7996e0b7b8d2fe173db934b36 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_23.txt @@ -0,0 +1 @@ +When using a TPU VM, you connect directly to the machine that the TPU is attached to, much like training on a GPU VM. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_24.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_24.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f16c40896830695c34839b1e4c9881d9c5a86b9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_24.txt @@ -0,0 +1 @@ +TPU VMs are generally easier to work with, particularly when it comes to your data pipeline. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_25.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_25.txt new file mode 100644 index 0000000000000000000000000000000000000000..4c1fa35d9d443378e85c0df65a5d8f1a9cac4170 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_25.txt @@ -0,0 +1 @@ +All of the above warnings do not apply to TPU VMs! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_26.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_26.txt new file mode 100644 index 0000000000000000000000000000000000000000..50f8766ca14a8879cca9bb8ac32ada0e6457b9b6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_26.txt @@ -0,0 +1 @@ +This is an opinionated document, so here’s our opinion: Avoid using TPU Node if possible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_27.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_27.txt new file mode 100644 index 0000000000000000000000000000000000000000..96fd6671fbc664dec05a402ee22d7925a7cefde2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_27.txt @@ -0,0 +1 @@ +It is more confusing and more difficult to debug than TPU VMs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_28.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_28.txt new file mode 100644 index 0000000000000000000000000000000000000000..a7da167a075995014f73eea9f06886bbda62c9eb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_28.txt @@ -0,0 +1 @@ +It is also likely to be unsupported in future - Google’s latest TPU, TPUv4, can only be accessed as a TPU VM, which suggests that TPU Nodes are increasingly going to become a “legacy” access method. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_29.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_29.txt new file mode 100644 index 0000000000000000000000000000000000000000..cc7c9a426718dcfb3b8917c15d167d56f27f7861 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_29.txt @@ -0,0 +1 @@ +However, we understand that the only free TPU access is on Colab and Kaggle Kernels, which uses TPU Node - so we’ll try to explain how to handle it if you have to! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_3.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..7de0c42ea771b562bdfff0ef28c0c317f7daeae1 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_3.txt @@ -0,0 +1 @@ +They are hardware designed by Google, which are used to greatly speed up the tensor computations within neural networks, much like GPUs. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_30.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_30.txt new file mode 100644 index 0000000000000000000000000000000000000000..b4c4bf9175fea373b81d1b13f77f52ce3b733acb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_30.txt @@ -0,0 +1 @@ +Check the TPU example notebook for code samples that explain this in more detail. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_31.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_31.txt new file mode 100644 index 0000000000000000000000000000000000000000..a72c35f597f9a4e9c5d5f3f026cbf57b2fcc79db --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_31.txt @@ -0,0 +1 @@ +What sizes of TPU are available? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_32.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_32.txt new file mode 100644 index 0000000000000000000000000000000000000000..80b77dd6231ea9716f09057b1754af5eed42e053 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_32.txt @@ -0,0 +1 @@ +A single TPU (a v2-8/v3-8/v4-8) runs 8 replicas. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_33.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_33.txt new file mode 100644 index 0000000000000000000000000000000000000000..57263608bc9236e0768fd1ca422f9f813d959293 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_33.txt @@ -0,0 +1 @@ +TPUs exist in pods that can run hundreds or thousands of replicas simultaneously. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_34.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_34.txt new file mode 100644 index 0000000000000000000000000000000000000000..c7f53b7e975354cf435c721acc93b011ca5ef815 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_34.txt @@ -0,0 +1 @@ +When you use more than a single TPU but less than a whole pod (for example, a v3-32), your TPU fleet is referred to as a pod slice. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_35.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_35.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f6cb30bd4a0ee2edf096ffd0a057e3fad405bb2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_35.txt @@ -0,0 +1 @@ +When you access a free TPU via Colab, you generally get a single v2-8 TPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_36.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_36.txt new file mode 100644 index 0000000000000000000000000000000000000000..3f4b5a1100cbe1b732266ec98bc4b918b60a9682 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_36.txt @@ -0,0 +1 @@ +I keep hearing about this XLA thing. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_37.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_37.txt new file mode 100644 index 0000000000000000000000000000000000000000..92b9e36433acb742a9a7ba7e42c6870c464981c2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_37.txt @@ -0,0 +1 @@ +What’s XLA, and how does it relate to TPUs? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_38.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_38.txt new file mode 100644 index 0000000000000000000000000000000000000000..47b6d3d2ea979f3f0e0db72170f38d4f0b79d419 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_38.txt @@ -0,0 +1 @@ +XLA is an optimizing compiler, used by both TensorFlow and JAX. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_39.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_39.txt new file mode 100644 index 0000000000000000000000000000000000000000..df956394f1bca49ea5f75394aaf951c588ae05a8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_39.txt @@ -0,0 +1 @@ +In JAX it is the only compiler, whereas in TensorFlow it is optional (but mandatory on TPU!). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_4.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..197824187881bd7f82aa2a10729c7312ecee5f62 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_4.txt @@ -0,0 +1 @@ +They can be used for both network training and inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_40.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_40.txt new file mode 100644 index 0000000000000000000000000000000000000000..1c236d91e3ec7b8672fb84b184f3428de5195ae5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_40.txt @@ -0,0 +1 @@ +The easiest way to enable it when training a Keras model is to pass the argument jit_compile=True to model.compile(). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_41.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_41.txt new file mode 100644 index 0000000000000000000000000000000000000000..f5f01745ea6a78fd1d36373df1b9e07f9c89767d --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_41.txt @@ -0,0 +1 @@ +If you don’t get any errors and performance is good, that’s a great sign that you’re ready to move to TPU! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_42.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_42.txt new file mode 100644 index 0000000000000000000000000000000000000000..916a9fc5cde060907e050ab17ea62ed28916e36e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_42.txt @@ -0,0 +1 @@ +Debugging on TPU is generally a bit harder than on CPU/GPU, so we recommend getting your code running on CPU/GPU with XLA first before trying it on TPU. 
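For reference, turning on XLA for a Keras model as described above really is a one-argument change to model.compile(); the tiny stand-in model below is only there to make the snippet self-contained.
python
import tensorflow as tf

# A tiny stand-in model, just to show where jit_compile goes.
model = tf.keras.Sequential([tf.keras.layers.Dense(2)])
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    jit_compile=True,  # compile the train/predict steps with XLA on CPU/GPU
)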
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_43.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_43.txt new file mode 100644 index 0000000000000000000000000000000000000000..f8012d54f32b5e1d5da344183972aa79e728e904 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_43.txt @@ -0,0 +1 @@ +You don’t have to train for long, of course - just for a few steps to make sure that your model and data pipeline are working like you expect them to. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_44.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_44.txt new file mode 100644 index 0000000000000000000000000000000000000000..49464c3469b8c3fff32afe710bec921d15205a95 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_44.txt @@ -0,0 +1 @@ +XLA compiled code is usually faster - so even if you’re not planning to run on TPU, adding jit_compile=True can improve your performance. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_45.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_45.txt new file mode 100644 index 0000000000000000000000000000000000000000..643d37b0f5298763d5eb27e1323147ab37eebd36 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_45.txt @@ -0,0 +1 @@ +Be sure to note the caveats below about XLA compatibility, though! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_46.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_46.txt new file mode 100644 index 0000000000000000000000000000000000000000..b5bfc36827bb689455601cf0ac58e637c3d11312 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_46.txt @@ -0,0 +1 @@ +Tip born of painful experience: Although using jit_compile=True is a good way to get a speed boost and test if your CPU/GPU code is XLA-compatible, it can actually cause a lot of problems if you leave it in when actually training on TPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_47.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_47.txt new file mode 100644 index 0000000000000000000000000000000000000000..251824f6f8cddc1a55fd8c7cbb094a2a3c8bd97a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_47.txt @@ -0,0 +1 @@ +XLA compilation will happen implicitly on TPU, so remember to remove that line before actually running your code on a TPU! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_48.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_48.txt new file mode 100644 index 0000000000000000000000000000000000000000..923675006180294f434e247c5e302f4353f2eafe --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_48.txt @@ -0,0 +1 @@ +How do I make my model XLA compatible? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_49.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_49.txt new file mode 100644 index 0000000000000000000000000000000000000000..2b60b1bf5d2eac1867912f0fcb01e584938361c6 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_49.txt @@ -0,0 +1 @@ +In many cases, your code is probably XLA-compatible already! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_5.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..60838e14ea5ccaff454fdab4adae23d992977126 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_5.txt @@ -0,0 +1 @@ +They are generally accessed through Google’s cloud services, but small TPUs can also be accessed directly for free through Google Colab and Kaggle Kernels. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_50.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_50.txt new file mode 100644 index 0000000000000000000000000000000000000000..9bdb68b654123597a4784e14650479c90da468ae --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_50.txt @@ -0,0 +1 @@ +However, there are a few things that work in normal TensorFlow that don’t work in XLA. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_51.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_51.txt new file mode 100644 index 0000000000000000000000000000000000000000..eb454b89de8d54320e22c897df77ea689e94bbfe --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_51.txt @@ -0,0 +1,3 @@ +We’ve distilled them into three core rules below: + +🤗Specific HuggingFace Tip🤗: We’ve put a lot of effort into rewriting our TensorFlow models and loss functions to be XLA-compatible. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_52.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_52.txt new file mode 100644 index 0000000000000000000000000000000000000000..28bdd39e3e4a1e36965a9a29af0b4e4921f13f13 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_52.txt @@ -0,0 +1 @@ +Our models and loss functions generally obey rule #1 and #2 by default, so you can skip over them if you’re using transformers models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_53.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_53.txt new file mode 100644 index 0000000000000000000000000000000000000000..e065f16b4d31103302348996a3540f7e4af4784c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_53.txt @@ -0,0 +1 @@ +Don’t forget about these rules when writing your own models and loss functions, though! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_54.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_54.txt new file mode 100644 index 0000000000000000000000000000000000000000..a957bbb710f28a409529dda5933e46885e326f1a --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_54.txt @@ -0,0 +1,2 @@ +XLA Rule #1: Your code cannot have “data-dependent conditionals” +What that means is that any if statement cannot depend on values inside a tf.Tensor. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_55.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_55.txt new file mode 100644 index 0000000000000000000000000000000000000000..c218be9c43e6921f3d4c9dc62a662de1ce308779 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_55.txt @@ -0,0 +1 @@ +For example, this code block cannot be compiled with XLA! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_56.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_56.txt new file mode 100644 index 0000000000000000000000000000000000000000..39391b0d16ab48ac8210b13aac9e7029839f8871 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_56.txt @@ -0,0 +1,4 @@ +python +if tf.reduce_sum(tensor) > 10: + tensor = tensor / 2.0 +This might seem very restrictive at first, but most neural net code doesn’t need to do this. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_57.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_57.txt new file mode 100644 index 0000000000000000000000000000000000000000..d49bf107f7102979e709a8110e986edffa53b0f2 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_57.txt @@ -0,0 +1,5 @@ +You can often get around this restriction by using tf.cond (see the documentation here) or by removing the conditional and finding a clever math trick with indicator variables instead, like so: +python +sum_over_10 = tf.cast(tf.reduce_sum(tensor) > 10, tf.float32) +tensor = tensor / (1.0 + sum_over_10) +This code has exactly the same effect as the code above, but by avoiding a conditional, we ensure it will compile with XLA without problems! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_58.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_58.txt new file mode 100644 index 0000000000000000000000000000000000000000..8223c449d34f88b8a300ea8cceeac497d3c5e7a8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_58.txt @@ -0,0 +1,2 @@ +XLA Rule #2: Your code cannot have “data-dependent shapes” +What this means is that the shape of all of the tf.Tensor objects in your code cannot depend on their values. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_59.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_59.txt new file mode 100644 index 0000000000000000000000000000000000000000..94a8578248c072f5558c5baff259df09a76f4294 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_59.txt @@ -0,0 +1 @@ +For example, the function tf.unique cannot be compiled with XLA, because it returns a tensor containing one instance of each unique value in the input. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_6.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..8a3dc18c5f0b833e476b429af3f8c8de1bd88c67 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_6.txt @@ -0,0 +1 @@ +Because all TensorFlow models in 🤗 Transformers are Keras models, most of the methods in this document are generally applicable to TPU training for any Keras model! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_60.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_60.txt new file mode 100644 index 0000000000000000000000000000000000000000..e4baf1c9647169d0faba090c90f6d3ec49bd9b85 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_60.txt @@ -0,0 +1 @@ +The shape of this output will obviously be different depending on how repetitive the input Tensor was, and so XLA refuses to handle it! 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_61.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_61.txt new file mode 100644 index 0000000000000000000000000000000000000000..715314e5fa0ebe5da968df94055cb2cd1c4a749b --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_61.txt @@ -0,0 +1 @@ +In general, most neural network code obeys rule #2 by default. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_62.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_62.txt new file mode 100644 index 0000000000000000000000000000000000000000..ee0d1cb0369662774fa59cf659e693596481f302 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_62.txt @@ -0,0 +1 @@ +However, there are a few common cases where it becomes a problem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_63.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_63.txt new file mode 100644 index 0000000000000000000000000000000000000000..a512589d18a7ce649941ea7005dbbad699514c7c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_63.txt @@ -0,0 +1 @@ +One very common one is when you use label masking, setting your labels to a negative value to indicate that those positions should be ignored when computing the loss. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_64.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_64.txt new file mode 100644 index 0000000000000000000000000000000000000000..2fee1a90bcf1bf2f6645d2faa32d3e80a37f6cf5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_64.txt @@ -0,0 +1,8 @@ +If you look at NumPy or PyTorch loss functions that support label masking, you will often see code like this that uses boolean indexing: +python +label_mask = labels >= 0 +masked_outputs = outputs[label_mask] +masked_labels = labels[label_mask] +loss = compute_loss(masked_outputs, masked_labels) +mean_loss = torch.mean(loss) +This code is totally fine in NumPy or PyTorch, but it breaks in XLA! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_65.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_65.txt new file mode 100644 index 0000000000000000000000000000000000000000..f9e07966c04eee20748a971960433112b34835f9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_65.txt @@ -0,0 +1 @@ +Why? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_66.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_66.txt new file mode 100644 index 0000000000000000000000000000000000000000..cce6d0bd5bb9a816422310ebd1c556b5138a04de --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_66.txt @@ -0,0 +1 @@ +Because the shape of masked_outputs and masked_labels depends on how many positions are masked - that makes it a data-dependent shape. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_67.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_67.txt new file mode 100644 index 0000000000000000000000000000000000000000..25fe4a15cda62cc36ca2b54cb15889b1235ebbc8 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_67.txt @@ -0,0 +1 @@ +However, just like for rule #1, we can often rewrite this code to yield exactly the same output without any data-dependent shapes. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_68.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_68.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf79dfc3a01e3b69e91500b381622e3a96f6d708 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_68.txt @@ -0,0 +1,6 @@ +python +label_mask = tf.cast(labels >= 0, tf.float32) +loss = compute_loss(outputs, labels) +loss = loss * label_mask # Set negative label positions to 0 +mean_loss = tf.reduce_sum(loss) / tf.reduce_sum(label_mask) +Here, we avoid data-dependent shapes by computing the loss for every position, but zeroing out the masked positions in both the numerator and denominator when we calculate the mean, which yields exactly the same result as the first block while maintaining XLA compatibility. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_69.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_69.txt new file mode 100644 index 0000000000000000000000000000000000000000..5827468182e4cab97f0f9654ca14005fffe1bd55 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_69.txt @@ -0,0 +1 @@ +Note that we use the same trick as in rule #1 - converting a tf.bool to tf.float32 and using it as an indicator variable. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_7.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..ecf4ba87c1e39ff0812389211d54faeab76e8e76 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_7.txt @@ -0,0 +1 @@ +However, there are a few points that are specific to the HuggingFace ecosystem (hug-o-system?) \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_70.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_70.txt new file mode 100644 index 0000000000000000000000000000000000000000..3390f96c85d6dfc2b3d32b30d79dbf474d199131 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_70.txt @@ -0,0 +1 @@ +This is a really useful trick, so remember it if you need to convert your own code to XLA! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_71.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_71.txt new file mode 100644 index 0000000000000000000000000000000000000000..660c02d1b1d781538657a8e7eedd7f34e8229da5 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_71.txt @@ -0,0 +1,2 @@ +XLA Rule #3: XLA will need to recompile your model for every different input shape it sees +This is the big one. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_72.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_72.txt new file mode 100644 index 0000000000000000000000000000000000000000..ffc58c646f88e0d6253f4d4d68b2c2db919f9317 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_72.txt @@ -0,0 +1 @@ +What this means is that if your input shapes are very variable, XLA will have to recompile your model over and over, which will create huge performance problems. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_73.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_73.txt new file mode 100644 index 0000000000000000000000000000000000000000..475c2a8c18ba4e5740903be901d69b2d1395d0c7 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_73.txt @@ -0,0 +1 @@ +This commonly arises in NLP models, where input texts have variable lengths after tokenization. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_74.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_74.txt new file mode 100644 index 0000000000000000000000000000000000000000..4213a43772e5344eb7f3f93b7382aae45f424330 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_74.txt @@ -0,0 +1 @@ +In other modalities, static shapes are more common and this rule is much less of a problem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_75.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_75.txt new file mode 100644 index 0000000000000000000000000000000000000000..857b77fd83ef62aff5dbfd1eade6dabe2e983453 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_75.txt @@ -0,0 +1 @@ +How can you get around rule #3? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_76.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_76.txt new file mode 100644 index 0000000000000000000000000000000000000000..81cc59e37d132fe683609e58f3f06d0d43656153 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_76.txt @@ -0,0 +1 @@ +The key is padding - if you pad all your inputs to the same length, and then use an attention_mask, you can get the same results as you’d get from variable shapes, but without any XLA issues. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_77.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_77.txt new file mode 100644 index 0000000000000000000000000000000000000000..6a44260438725c1653d7de7bf1b226fb742048ed --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_77.txt @@ -0,0 +1 @@ +However, excessive padding can cause severe slowdown too - if you pad all your samples to the maximum length in the whole dataset, you might end up with batches consisting of endless padding tokens, which will waste a lot of compute and memory! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_78.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_78.txt new file mode 100644 index 0000000000000000000000000000000000000000..93758fa9b0c7fb3682fa80cfdfc35e48c0c4ec6f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_78.txt @@ -0,0 +1 @@ +There isn’t a perfect solution to this problem. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_79.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_79.txt new file mode 100644 index 0000000000000000000000000000000000000000..e6603684f97e72e889de364d15d20358e1e7f726 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_79.txt @@ -0,0 +1 @@ +However, you can try some tricks. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_8.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..232a0776ea42b098a4ec6ef6a9f6432a50cdb484 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_8.txt @@ -0,0 +1 @@ +of Transformers and Datasets, and we’ll make sure to flag them up when we get to them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_80.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_80.txt new file mode 100644 index 0000000000000000000000000000000000000000..8265e5bf4bda393307ab360c69b75f0acebe807e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_80.txt @@ -0,0 +1 @@ +One very useful trick is to pad batches of samples up to a multiple of a number like 32 or 64 tokens. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_81.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_81.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef08474091b81e870c48960ed3fa46d37d39058e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_81.txt @@ -0,0 +1 @@ +This often only increases the number of tokens by a small amount, but it hugely reduces the number of unique input shapes, because every input shape now has to be a multiple of 32 or 64. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_82.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_82.txt new file mode 100644 index 0000000000000000000000000000000000000000..28761c901580f1aa12a3a5c40eec660e3685ed2e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_82.txt @@ -0,0 +1 @@ +Fewer unique input shapes means fewer XLA compilations! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_83.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_83.txt new file mode 100644 index 0000000000000000000000000000000000000000..67496acfc2380ebd335b57255feb01ee22a08d43 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_83.txt @@ -0,0 +1 @@ +🤗Specific HuggingFace Tip🤗: Our tokenizers and data collators have methods that can help you here. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_84.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_84.txt new file mode 100644 index 0000000000000000000000000000000000000000..a2be89eedfcbd35d6c7f5942c6004d0053675ddb --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_84.txt @@ -0,0 +1 @@ +You can use padding="max_length" or padding="longest" when calling tokenizers to get them to output padded data. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_85.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_85.txt new file mode 100644 index 0000000000000000000000000000000000000000..29a4c16fda33d9b2d9015a251ea64316807c876c --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_85.txt @@ -0,0 +1 @@ +Our tokenizers and data collators also have a pad_to_multiple_of argument that you can use to reduce the number of unique input shapes you see! 
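As a concrete illustration of the padding advice above (the checkpoint name and example sentences are arbitrary), both the tokenizer call and DataCollatorWithPadding accept a pad_to_multiple_of argument:
python
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

# Pad to the longest sample in the batch, rounded up to a multiple of 64 tokens,
# so the number of distinct input shapes (and XLA recompilations) stays small.
batch = tokenizer(
    ["a short sentence", "a slightly longer example sentence"],
    padding="longest",
    pad_to_multiple_of=64,
    return_tensors="np",
)
print(batch["input_ids"].shape)  # sequence dimension is a multiple of 64

# The same option is available when padding is deferred to the data collator.
collator = DataCollatorWithPadding(tokenizer, padding="longest", pad_to_multiple_of=64, return_tensors="np")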
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_86.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_86.txt new file mode 100644 index 0000000000000000000000000000000000000000..8151be8a1c68718d0aff975e92466305fb4edeee --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_86.txt @@ -0,0 +1 @@ +How do I actually train my model on TPU? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_87.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_87.txt new file mode 100644 index 0000000000000000000000000000000000000000..f97c9a2b8ddd890bbe6a660834a8f041fa526f9f --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_87.txt @@ -0,0 +1 @@ +Once your training is XLA-compatible and (if you’re using TPU Node / Colab) your dataset has been prepared appropriately, running on TPU is surprisingly easy! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_88.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_88.txt new file mode 100644 index 0000000000000000000000000000000000000000..7e4d24cfe9d833ae275808e1f6b4835fdd7b8560 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_88.txt @@ -0,0 +1 @@ +All you really need to change in your code is to add a few lines to initialize your TPU, and to ensure that your model and dataset are created inside a TPUStrategy scope. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_89.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_89.txt new file mode 100644 index 0000000000000000000000000000000000000000..b69b13eeaf351ab7cf1b40193204c9c27e4c05d9 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_89.txt @@ -0,0 +1 @@ +Take a look at our TPU example notebook to see this in action! \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_9.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..310ed0bb628557e9c61c19f23bbc24a3e04af37e --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_9.txt @@ -0,0 +1 @@ +What kinds of TPU are available? \ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_90.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_90.txt new file mode 100644 index 0000000000000000000000000000000000000000..97556850e1a9179d369bfa9296d5ac232d9499f0 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_90.txt @@ -0,0 +1,10 @@ +Summary +There was a lot in here, so let’s summarize with a quick checklist you can follow when you want to get your model ready for TPU training: + +Make sure your code follows the three rules of XLA +Compile your model with jit_compile=True on CPU/GPU and confirm that you can train it with XLA +Either load your dataset into memory or use a TPU-compatible dataset loading approach (see notebook) +Migrate your code either to Colab (with accelerator set to “TPU”) or a TPU VM on Google Cloud +Add TPU initializer code (see notebook) +Create your TPUStrategy and make sure dataset loading and model creation are inside the strategy.scope() (see notebook) +Don’t forget to take jit_compile=True out again when you move to TPU! 
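The "Add TPU initializer code" and strategy.scope() steps from the checklist above usually boil down to a few lines like the sketch below; the empty tpu="" argument is the pattern commonly used on Colab, and the resolver arguments may differ on a TPU VM.
python
import tensorflow as tf

# Connect to and initialize the TPU system.
resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu="")
tf.config.experimental_connect_to_cluster(resolver)
tf.tpu.experimental.initialize_tpu_system(resolver)
strategy = tf.distribute.TPUStrategy(resolver)

# Model creation (and compilation) must happen inside the strategy scope.
with strategy.scope():
    model = tf.keras.Sequential([tf.keras.layers.Dense(2)])  # stand-in for a Transformers TF model
    model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# model.fit(...) is then called as usual.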
\ No newline at end of file diff --git a/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_91.txt b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_91.txt new file mode 100644 index 0000000000000000000000000000000000000000..32beca617ddf96ed0e1522966dd012a991f33145 --- /dev/null +++ b/chunked/nltk_chunking/_perf_train_tpu_tf/chunk_91.txt @@ -0,0 +1,3 @@ +🙏🙏🙏🥺🥺🥺 +Call model.fit() +You did it! \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_0.txt b/chunked/nltk_chunking/_performance/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..8b69f96d643e1f823c3f1888871ea2c79bca8457 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_0.txt @@ -0,0 +1,3 @@ + +Performance and Scalability +Training large transformer models and deploying them to production present various challenges. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_1.txt b/chunked/nltk_chunking/_performance/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..95cc0b8d013a9556f19953243acf49ed2da34741 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_1.txt @@ -0,0 +1 @@ +During training, the model may require more GPU memory than available or exhibit slow training speed. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_10.txt b/chunked/nltk_chunking/_performance/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..99f6029256affd3555c96370bdca0513fc42bbf8 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_10.txt @@ -0,0 +1 @@ +However, there are also techniques that are specific to multi-GPU or CPU training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_11.txt b/chunked/nltk_chunking/_performance/chunk_11.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef8a770d4a8f7830fd157d5211cde58e2d98ab8a --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_11.txt @@ -0,0 +1,2 @@ +We cover them in +separate sections. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_12.txt b/chunked/nltk_chunking/_performance/chunk_12.txt new file mode 100644 index 0000000000000000000000000000000000000000..9b2b47c8e6dc26f4d3e3f12805e31738543202ed --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_12.txt @@ -0,0 +1 @@ +Methods and tools for efficient training on a single GPU: start here to learn common approaches that can help optimize GPU memory utilization, speed up the training, or both. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_13.txt b/chunked/nltk_chunking/_performance/chunk_13.txt new file mode 100644 index 0000000000000000000000000000000000000000..c723b70d56dfb16272374f7b094b90597025320b --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_13.txt @@ -0,0 +1 @@ +Multi-GPU training section: explore this section to learn about further optimization methods that apply to multi-GPU settings, such as data, tensor, and pipeline parallelism. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_14.txt b/chunked/nltk_chunking/_performance/chunk_14.txt new file mode 100644 index 0000000000000000000000000000000000000000..96651029667eefd27f8329014b4136e9718e636f --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_14.txt @@ -0,0 +1 @@ +CPU training section: learn about mixed precision training on CPU. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_15.txt b/chunked/nltk_chunking/_performance/chunk_15.txt new file mode 100644 index 0000000000000000000000000000000000000000..0179fe3fde751268fd3d4b7c454a3e75506fccb4 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_15.txt @@ -0,0 +1 @@ +Efficient Training on Multiple CPUs: learn about distributed CPU training. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_16.txt b/chunked/nltk_chunking/_performance/chunk_16.txt new file mode 100644 index 0000000000000000000000000000000000000000..7d0ea5bb8290811708aa49b7c8a35a7bc745389b --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_16.txt @@ -0,0 +1 @@ +Training on TPU with TensorFlow: if you are new to TPUs, refer to this section for an opinionated introduction to training on TPUs and using XLA. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_17.txt b/chunked/nltk_chunking/_performance/chunk_17.txt new file mode 100644 index 0000000000000000000000000000000000000000..78509d0d00f6622da2a3f4307b26264585714013 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_17.txt @@ -0,0 +1 @@ +Custom hardware for training: find tips and tricks when building your own deep learning rig. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_18.txt b/chunked/nltk_chunking/_performance/chunk_18.txt new file mode 100644 index 0000000000000000000000000000000000000000..fef3cd89bc685b44742edf3a53d6341ea0897de1 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_18.txt @@ -0,0 +1,4 @@ +Hyperparameter Search using Trainer API + +Inference +Efficient inference with large models in a production environment can be as challenging as training them. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_19.txt b/chunked/nltk_chunking/_performance/chunk_19.txt new file mode 100644 index 0000000000000000000000000000000000000000..43a430aba2f2cfedacea94a9771edbd74528931c --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_19.txt @@ -0,0 +1,2 @@ +In the following +sections we go through the steps to run inference on CPU and single/multi-GPU setups. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_2.txt b/chunked/nltk_chunking/_performance/chunk_2.txt new file mode 100644 index 0000000000000000000000000000000000000000..305fc2e9a01434f80fc816186ea95451f76687f0 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_2.txt @@ -0,0 +1,2 @@ +In the deployment +phase, the model can struggle to handle the required throughput in a production environment. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_20.txt b/chunked/nltk_chunking/_performance/chunk_20.txt new file mode 100644 index 0000000000000000000000000000000000000000..5e0c0639c4700d5e0445c8048707eb4a29787e43 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_20.txt @@ -0,0 +1,7 @@ +Inference on a single CPU +Inference on a single GPU +Multi-GPU inference +XLA Integration for TensorFlow Models + +Training and inference +Here you'll find techniques, tips and tricks that apply whether you are training a model, or running inference with it. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_21.txt b/chunked/nltk_chunking/_performance/chunk_21.txt new file mode 100644 index 0000000000000000000000000000000000000000..b408144e50aaf6f3d8e8f3f6a31388753f45b929 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_21.txt @@ -0,0 +1,6 @@ +Instantiating a big model +Troubleshooting performance issues + +Contribute +This document is far from being complete and a lot more needs to be added, so if you have additions or corrections to +make please don't hesitate to open a PR or if you aren't sure start an Issue and we can discuss the details there. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_22.txt b/chunked/nltk_chunking/_performance/chunk_22.txt new file mode 100644 index 0000000000000000000000000000000000000000..100332b71102d1fd75d4177681a4b5fd2ef83fec --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_22.txt @@ -0,0 +1,2 @@ +When making contributions that A is better than B, please try to include a reproducible benchmark and/or a link to the +source of that information (unless it comes directly from you). \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_3.txt b/chunked/nltk_chunking/_performance/chunk_3.txt new file mode 100644 index 0000000000000000000000000000000000000000..ed639a38c4e510ddb9076c85f15a092080f6e65a --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_3.txt @@ -0,0 +1 @@ +This documentation aims to assist you in overcoming these challenges and finding the optimal setting for your use-case. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_4.txt b/chunked/nltk_chunking/_performance/chunk_4.txt new file mode 100644 index 0000000000000000000000000000000000000000..9f70514c0256b34b841ee30bbbb81f0a017ef0f6 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_4.txt @@ -0,0 +1 @@ +The guides are divided into training and inference sections, as each comes with different challenges and solutions. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_5.txt b/chunked/nltk_chunking/_performance/chunk_5.txt new file mode 100644 index 0000000000000000000000000000000000000000..b18e16f65b50d131f90c84eff63c03d51676170a --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_5.txt @@ -0,0 +1,2 @@ +Within each section you'll find separate guides for different hardware configurations, such as single GPU vs. multi-GPU +for training or CPU vs. GPU for inference. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_6.txt b/chunked/nltk_chunking/_performance/chunk_6.txt new file mode 100644 index 0000000000000000000000000000000000000000..97dfaa0d3e3a1190da7ae3ac59e788b27b8fa4e4 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_6.txt @@ -0,0 +1 @@ +Use this document as your starting point to navigate further to the methods that match your scenario. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_7.txt b/chunked/nltk_chunking/_performance/chunk_7.txt new file mode 100644 index 0000000000000000000000000000000000000000..c9ede54f43d2f0da14e08fd0aae41331c7333588 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_7.txt @@ -0,0 +1,2 @@ +Training +Training large transformer models efficiently requires an accelerator such as a GPU or TPU. 
\ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_8.txt b/chunked/nltk_chunking/_performance/chunk_8.txt new file mode 100644 index 0000000000000000000000000000000000000000..b63af78a4a81677b3acdde9e5b5d7c2abb323300 --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_8.txt @@ -0,0 +1,2 @@ +The most common case is where +you have a single GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_performance/chunk_9.txt b/chunked/nltk_chunking/_performance/chunk_9.txt new file mode 100644 index 0000000000000000000000000000000000000000..6476b417de3a9bf6e84cc96c2757d488d8cc876e --- /dev/null +++ b/chunked/nltk_chunking/_performance/chunk_9.txt @@ -0,0 +1,2 @@ +The methods that you can apply to improve training efficiency on a single GPU extend to other setups +such as multiple GPU. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perplexity/chunk_0.txt b/chunked/nltk_chunking/_perplexity/chunk_0.txt new file mode 100644 index 0000000000000000000000000000000000000000..010539956291ad322e1daa18f5fd7c9b0c977d18 --- /dev/null +++ b/chunked/nltk_chunking/_perplexity/chunk_0.txt @@ -0,0 +1,4 @@ + +Perplexity of fixed-length models +[[open-in-colab]] +Perplexity (PPL) is one of the most common metrics for evaluating language models. \ No newline at end of file diff --git a/chunked/nltk_chunking/_perplexity/chunk_1.txt b/chunked/nltk_chunking/_perplexity/chunk_1.txt new file mode 100644 index 0000000000000000000000000000000000000000..f1a724bb0d66936bf302b93f405f6285341b9cf2 --- /dev/null +++ b/chunked/nltk_chunking/_perplexity/chunk_1.txt @@ -0,0 +1,3 @@ +Before diving in, we should note +that the metric applies specifically to classical language models (sometimes called autoregressive or causal language +models) and is not well defined for masked language models like BERT (see summary of the models). \ No newline at end of file diff --git a/chunked/nltk_chunking/_perplexity/chunk_10.txt b/chunked/nltk_chunking/_perplexity/chunk_10.txt new file mode 100644 index 0000000000000000000000000000000000000000..6f273b0a84af0c9eb7702a6a32cb831dc20329fa --- /dev/null +++ b/chunked/nltk_chunking/_perplexity/chunk_10.txt @@ -0,0 +1,2 @@ +The largest version of GPT-2, for example, has a fixed length of 1024 tokens, so we +cannot calculate \(p_\theta(x_t|x_{